blob: 1fcc83e63a3240e4405109e91e224c3ddcb9a72e [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080051class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080053/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055/* --- Globals ------------------------------------------------------------
56
Serhiy Storchaka05997252013-01-26 12:14:02 +020057NOTE: In the interpreter's initialization phase, some globals are currently
58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000060
61*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the assertions in this file: debug builds run
   _PyUnicode_CheckConsistency() with check_content == 0, all other builds
   only perform the PyUnicode_Check() type test. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020076
/* Field accessors.  The underscore-prefixed variants are plain member
   accesses (usable as lvalues); the PyUnicode_* variants additionally
   assert that the object is a consistent, ready string. */

/* Cached UTF-8 pointer stored in the PyCompactUnicodeObject part. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 data: for compact ASCII objects this is the memory located
   immediately after the PyASCIIObject struct, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Cached UTF-8 length stored in the PyCompactUnicodeObject part. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length: for compact ASCII objects this is the string length,
   otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation and its length. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Unchecked access to the length, state and hash members. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Checked access to the kind and the length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Data pointer member of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200111
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   object is already ready, otherwise to the result of _PyUnicode_Ready(op).
   The object is checked with _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op)                            \
          ? 0                                           \
          : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True when the cached UTF-8 pointer is the same pointer as the object's
   data buffer.  Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True when the wstr pointer is the same pointer as the object's data
   buffer.  The expansion refers only to the macro argument `op`, so the
   macro does not depend on how the caller names its variable. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object has its own UTF-8 memory block: it is not a
   compact ASCII object, the utf8 pointer is set, and that pointer is not
   the object's data buffer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op)                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object has its own wstr memory block: the wstr
   pointer is set and either the object is not ready yet or the pointer is
   not the object's data buffer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op)                                \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
140
/* Generic helper macro that copies a run of characters while converting
   each one from from_type to to_type with a plain cast.

   from_type, to_type: valid type names.
   begin, end:         pointers delimiting the source characters
                       ("from_type *"); end is exclusive.
   to:                 destination buffer ("to_type *").

   The bulk of the input is handled four characters per iteration; the
   remaining zero to three characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (from_type *)(begin);                  \
        const from_type *_end = (from_type *)(end);                     \
        Py_ssize_t _count = _end - _iter;                               \
        const from_type *_bulk_end =                                    \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);                     \
        for (; _iter < _bulk_end; _iter += 4, _to += 4) {               \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < _end; ++_iter, ++_to)                            \
            *_to = (to_type) *_iter;                                    \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200179 do { \
180 if (unicode_empty != NULL) \
181 Py_INCREF(unicode_empty); \
182 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200183 unicode_empty = PyUnicode_New(0, 0); \
184 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200185 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200186 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
187 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200189 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190
Serhiy Storchaka678db842013-01-26 12:16:36 +0200191#define _Py_RETURN_UNICODE_EMPTY() \
192 do { \
193 _Py_INCREF_UNICODE_EMPTY(); \
194 return unicode_empty; \
195 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by code point, with 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Same for linebreaks: a 128-entry table indexed by code point, with 1 for
   line break characters and 0 otherwise.  The table is only ever read, so
   it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
       0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
292
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300293#include "clinic/unicodeobject.c.h"
294
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300295/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
296 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000298PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000300#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000303 /* This is actually an illegal character, so it should
304 not be passed to unichr. */
305 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000306#endif
307}
308
#ifdef Py_DEBUG
/* Debug-only validator for the internal invariants of a Unicode object.

   op:            the object to check; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the string is not a wchar_t-only
                  string), the characters are scanned as well to verify
                  that the narrowest possible kind is used and that the
                  data is followed by a null character.

   Every violation triggers an assert().  The function always returns 1 so
   that the call itself can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: the data starts right after the
               PyCompactUnicodeObject struct */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact string that is ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else
                    assert(compact->utf8 != data);
            }
            else {
                /* string that only has a wchar_t representation so far */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr points at the data exactly when the kind matches the
               width of wchar_t */
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else
                assert(ascii->wstr != data);
        }

        /* a missing cache must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the data must be null-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
424
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100425static PyObject*
426unicode_result_wchar(PyObject *unicode)
427{
428#ifndef Py_DEBUG
429 Py_ssize_t len;
430
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 len = _PyUnicode_WSTR_LENGTH(unicode);
432 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200434 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100435 }
436
437 if (len == 1) {
438 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100439 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100440 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
441 Py_DECREF(unicode);
442 return latin1_char;
443 }
444 }
445
446 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200447 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100448 return NULL;
449 }
450#else
Victor Stinneraa771272012-10-04 02:32:58 +0200451 assert(Py_REFCNT(unicode) == 1);
452
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100453 /* don't make the result ready in debug mode to ensure that the caller
454 makes the string ready before using it */
455 assert(_PyUnicode_CheckConsistency(unicode, 1));
456#endif
457 return unicode;
458}
459
460static PyObject*
461unicode_result_ready(PyObject *unicode)
462{
463 Py_ssize_t length;
464
465 length = PyUnicode_GET_LENGTH(unicode);
466 if (length == 0) {
467 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200469 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100470 }
471 return unicode_empty;
472 }
473
474 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200475 void *data = PyUnicode_DATA(unicode);
476 int kind = PyUnicode_KIND(unicode);
477 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100478 if (ch < 256) {
479 PyObject *latin1_char = unicode_latin1[ch];
480 if (latin1_char != NULL) {
481 if (unicode != latin1_char) {
482 Py_INCREF(latin1_char);
483 Py_DECREF(unicode);
484 }
485 return latin1_char;
486 }
487 else {
488 assert(_PyUnicode_CheckConsistency(unicode, 1));
489 Py_INCREF(unicode);
490 unicode_latin1[ch] = unicode;
491 return unicode;
492 }
493 }
494 }
495
496 assert(_PyUnicode_CheckConsistency(unicode, 1));
497 return unicode;
498}
499
500static PyObject*
501unicode_result(PyObject *unicode)
502{
503 assert(_PyUnicode_CHECK(unicode));
504 if (PyUnicode_IS_READY(unicode))
505 return unicode_result_ready(unicode);
506 else
507 return unicode_result_wchar(unicode);
508}
509
Victor Stinnerc4b49542011-12-11 22:44:26 +0100510static PyObject*
511unicode_result_unchanged(PyObject *unicode)
512{
513 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500514 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100515 return NULL;
516 Py_INCREF(unicode);
517 return unicode;
518 }
519 else
520 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100521 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100522}
523
Thomas Wouters477c8d52006-05-27 19:21:47 +0000524/* --- Bloom Filters ----------------------------------------------------- */
525
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */
529
530/* the linebreak mask is set up by Unicode_Init below */
531
Antoine Pitrouf068f942010-01-13 14:19:12 +0000532#if LONG_BIT >= 128
533#define BLOOM_WIDTH 128
534#elif LONG_BIT >= 64
535#define BLOOM_WIDTH 64
536#elif LONG_BIT >= 32
537#define BLOOM_WIDTH 32
538#else
539#error "LONG_BIT is smaller than 32"
540#endif
541
/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Mask used by BLOOM_LINEBREAK(); starts with every bit set. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low-order bits of `ch` is set in
   `mask`.  Both arguments are parenthesized so that compound expressions
   such as BLOOM(a | b, ch) are evaluated as intended. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Non-zero if `ch` is a line break: characters below 128 are looked up in
   ascii_linebreak[], all others are pre-filtered with the bloom mask before
   calling Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
Alexander Belopolsky40018472011-02-26 01:02:56 +0000552Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554{
Victor Stinnera85af502013-04-09 21:53:54 +0200555#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
556 do { \
557 TYPE *data = (TYPE *)PTR; \
558 TYPE *end = data + LEN; \
559 Py_UCS4 ch; \
560 for (; data != end; data++) { \
561 ch = *data; \
562 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
563 } \
564 break; \
565 } while (0)
566
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567 /* calculate simple bloom-style bitmask for a given unicode string */
568
Antoine Pitrouf068f942010-01-13 14:19:12 +0000569 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000570
571 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200572 switch (kind) {
573 case PyUnicode_1BYTE_KIND:
574 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
575 break;
576 case PyUnicode_2BYTE_KIND:
577 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
578 break;
579 case PyUnicode_4BYTE_KIND:
580 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
581 break;
582 default:
583 assert(0);
584 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000585 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200586
587#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000588}
589
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200590/* Compilation of templated routines */
591
592#include "stringlib/asciilib.h"
593#include "stringlib/fastsearch.h"
594#include "stringlib/partition.h"
595#include "stringlib/split.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
598#include "stringlib/find_max_char.h"
599#include "stringlib/localeutil.h"
600#include "stringlib/undef.h"
601
602#include "stringlib/ucs1lib.h"
603#include "stringlib/fastsearch.h"
604#include "stringlib/partition.h"
605#include "stringlib/split.h"
606#include "stringlib/count.h"
607#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300608#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200609#include "stringlib/find_max_char.h"
610#include "stringlib/localeutil.h"
611#include "stringlib/undef.h"
612
613#include "stringlib/ucs2lib.h"
614#include "stringlib/fastsearch.h"
615#include "stringlib/partition.h"
616#include "stringlib/split.h"
617#include "stringlib/count.h"
618#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300619#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200620#include "stringlib/find_max_char.h"
621#include "stringlib/localeutil.h"
622#include "stringlib/undef.h"
623
624#include "stringlib/ucs4lib.h"
625#include "stringlib/fastsearch.h"
626#include "stringlib/partition.h"
627#include "stringlib/split.h"
628#include "stringlib/count.h"
629#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300630#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200631#include "stringlib/find_max_char.h"
632#include "stringlib/localeutil.h"
633#include "stringlib/undef.h"
634
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200635#include "stringlib/unicodedefs.h"
636#include "stringlib/fastsearch.h"
637#include "stringlib/count.h"
638#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100639#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200640
Guido van Rossumd57fd912000-03-10 22:53:23 +0000641/* --- Unicode Object ----------------------------------------------------- */
642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200643static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200644fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200646Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200647 Py_ssize_t size, Py_UCS4 ch,
648 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200650 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
651
652 switch (kind) {
653 case PyUnicode_1BYTE_KIND:
654 {
655 Py_UCS1 ch1 = (Py_UCS1) ch;
656 if (ch1 == ch)
657 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
658 else
659 return -1;
660 }
661 case PyUnicode_2BYTE_KIND:
662 {
663 Py_UCS2 ch2 = (Py_UCS2) ch;
664 if (ch2 == ch)
665 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
666 else
667 return -1;
668 }
669 case PyUnicode_4BYTE_KIND:
670 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
671 default:
672 assert(0);
673 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200675}
676
#ifdef Py_DEBUG
/* Debug-build helper: overwrite the characters in the range
   [old_length, current length) with 0xff bytes so that code reading data it
   never initialized is noticed sooner.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Nothing is written when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* Offsets and sizes are in bytes, hence the scaling by char_size. */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
695
Victor Stinnerfe226c02011-10-03 03:52:20 +0200696static PyObject*
697resize_compact(PyObject *unicode, Py_ssize_t length)
698{
699 Py_ssize_t char_size;
700 Py_ssize_t struct_size;
701 Py_ssize_t new_size;
702 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100703 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200704#ifdef Py_DEBUG
705 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
706#endif
707
Victor Stinner79891572012-05-03 13:43:07 +0200708 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100710 assert(PyUnicode_IS_COMPACT(unicode));
711
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200712 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100713 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200714 struct_size = sizeof(PyASCIIObject);
715 else
716 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200717 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
720 PyErr_NoMemory();
721 return NULL;
722 }
723 new_size = (struct_size + (length + 1) * char_size);
724
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200725 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
726 PyObject_DEL(_PyUnicode_UTF8(unicode));
727 _PyUnicode_UTF8(unicode) = NULL;
728 _PyUnicode_UTF8_LENGTH(unicode) = 0;
729 }
Victor Stinner84def372011-12-11 20:04:56 +0100730 _Py_DEC_REFTOTAL;
731 _Py_ForgetReference(unicode);
732
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300733 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100734 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100735 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 PyErr_NoMemory();
737 return NULL;
738 }
Victor Stinner84def372011-12-11 20:04:56 +0100739 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200740 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100741
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200743 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200744 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100745 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200746 _PyUnicode_WSTR_LENGTH(unicode) = length;
747 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100748 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
749 PyObject_DEL(_PyUnicode_WSTR(unicode));
750 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100751 if (!PyUnicode_IS_ASCII(unicode))
752 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100753 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200754#ifdef Py_DEBUG
755 unicode_fill_invalid(unicode, old_length);
756#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200757 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
758 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200759 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760 return unicode;
761}
762
Alexander Belopolsky40018472011-02-26 01:02:56 +0000763static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200764resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000765{
Victor Stinner95663112011-10-04 01:03:50 +0200766 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100767 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200768 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000770
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771 if (PyUnicode_IS_READY(unicode)) {
772 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200773 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200775#ifdef Py_DEBUG
776 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
777#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200780 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200781 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
782 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200783
784 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
785 PyErr_NoMemory();
786 return -1;
787 }
788 new_size = (length + 1) * char_size;
789
Victor Stinner7a9105a2011-12-12 00:13:42 +0100790 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
791 {
792 PyObject_DEL(_PyUnicode_UTF8(unicode));
793 _PyUnicode_UTF8(unicode) = NULL;
794 _PyUnicode_UTF8_LENGTH(unicode) = 0;
795 }
796
Victor Stinnerfe226c02011-10-03 03:52:20 +0200797 data = (PyObject *)PyObject_REALLOC(data, new_size);
798 if (data == NULL) {
799 PyErr_NoMemory();
800 return -1;
801 }
802 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200803 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200804 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200805 _PyUnicode_WSTR_LENGTH(unicode) = length;
806 }
807 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200808 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200809 _PyUnicode_UTF8_LENGTH(unicode) = length;
810 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200811 _PyUnicode_LENGTH(unicode) = length;
812 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200813#ifdef Py_DEBUG
814 unicode_fill_invalid(unicode, old_length);
815#endif
Victor Stinner95663112011-10-04 01:03:50 +0200816 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200817 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200818 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200819 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200820 }
Victor Stinner95663112011-10-04 01:03:50 +0200821 assert(_PyUnicode_WSTR(unicode) != NULL);
822
823 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700824 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +0200825 PyErr_NoMemory();
826 return -1;
827 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100828 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200829 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100830 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200831 if (!wstr) {
832 PyErr_NoMemory();
833 return -1;
834 }
835 _PyUnicode_WSTR(unicode) = wstr;
836 _PyUnicode_WSTR(unicode)[length] = 0;
837 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200838 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000839 return 0;
840}
841
Victor Stinnerfe226c02011-10-03 03:52:20 +0200842static PyObject*
843resize_copy(PyObject *unicode, Py_ssize_t length)
844{
845 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100846 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200847 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100848
Benjamin Petersonbac79492012-01-14 13:34:47 -0500849 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100850 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200851
852 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
853 if (copy == NULL)
854 return NULL;
855
856 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200857 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200858 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200859 }
860 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200861 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100862
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200863 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200864 if (w == NULL)
865 return NULL;
866 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
867 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200868 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
869 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200870 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200871 }
872}
873
Guido van Rossumd57fd912000-03-10 22:53:23 +0000874/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000875 Ux0000 terminated; some code (e.g. new_identifier)
876 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000877
878 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000879 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880
881*/
882
Alexander Belopolsky40018472011-02-26 01:02:56 +0000883static PyUnicodeObject *
884_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200886 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200887 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000888
Thomas Wouters477c8d52006-05-27 19:21:47 +0000889 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 if (length == 0 && unicode_empty != NULL) {
891 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200892 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 }
894
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000895 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700896 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000897 return (PyUnicodeObject *)PyErr_NoMemory();
898 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200899 if (length < 0) {
900 PyErr_SetString(PyExc_SystemError,
901 "Negative size passed to _PyUnicode_New");
902 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000903 }
904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
906 if (unicode == NULL)
907 return NULL;
908 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100909
910 _PyUnicode_WSTR_LENGTH(unicode) = length;
911 _PyUnicode_HASH(unicode) = -1;
912 _PyUnicode_STATE(unicode).interned = 0;
913 _PyUnicode_STATE(unicode).kind = 0;
914 _PyUnicode_STATE(unicode).compact = 0;
915 _PyUnicode_STATE(unicode).ready = 0;
916 _PyUnicode_STATE(unicode).ascii = 0;
917 _PyUnicode_DATA_ANY(unicode) = NULL;
918 _PyUnicode_LENGTH(unicode) = 0;
919 _PyUnicode_UTF8(unicode) = NULL;
920 _PyUnicode_UTF8_LENGTH(unicode) = 0;
921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200922 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
923 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100924 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000925 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100926 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000927 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200928
Jeremy Hyltond8082792003-09-16 19:41:39 +0000929 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000930 * the caller fails before initializing str -- unicode_resize()
931 * reads str[0], and the Keep-Alive optimization can keep memory
932 * allocated for str alive across a call to unicode_dealloc(unicode).
933 * We don't want unicode_resize to read uninitialized memory in
934 * that case.
935 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200936 _PyUnicode_WSTR(unicode)[0] = 0;
937 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100938
Victor Stinner7931d9a2011-11-04 00:22:48 +0100939 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000940 return unicode;
941}
942
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943static const char*
944unicode_kind_name(PyObject *unicode)
945{
Victor Stinner42dfd712011-10-03 14:41:45 +0200946 /* don't check consistency: unicode_kind_name() is called from
947 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200948 if (!PyUnicode_IS_COMPACT(unicode))
949 {
950 if (!PyUnicode_IS_READY(unicode))
951 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600952 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200953 {
954 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200955 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200956 return "legacy ascii";
957 else
958 return "legacy latin1";
959 case PyUnicode_2BYTE_KIND:
960 return "legacy UCS2";
961 case PyUnicode_4BYTE_KIND:
962 return "legacy UCS4";
963 default:
964 return "<legacy invalid kind>";
965 }
966 }
967 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600968 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 return "ascii";
972 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200973 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200974 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200975 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200976 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200977 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200978 default:
979 return "<invalid compact kind>";
980 }
981}
982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
988
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
992void *_PyUnicode_data(void *unicode){
993 printf("obj %p\n", unicode);
994 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
995 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
996 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
997 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
998 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
999 return PyUnicode_DATA(unicode);
1000}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001001
1002void
1003_PyUnicode_Dump(PyObject *op)
1004{
1005 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001006 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1007 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1008 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001009
Victor Stinnera849a4b2011-10-03 12:12:11 +02001010 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001011 {
1012 if (ascii->state.ascii)
1013 data = (ascii + 1);
1014 else
1015 data = (compact + 1);
1016 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001017 else
1018 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001019 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1020 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001021
Victor Stinnera849a4b2011-10-03 12:12:11 +02001022 if (ascii->wstr == data)
1023 printf("shared ");
1024 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001025
Victor Stinnera3b334d2011-10-03 13:53:37 +02001026 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001027 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001028 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1029 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001030 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1031 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001032 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001033 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001034}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001035#endif
1036
1037PyObject *
1038PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1039{
1040 PyObject *obj;
1041 PyCompactUnicodeObject *unicode;
1042 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001043 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001044 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 Py_ssize_t char_size;
1046 Py_ssize_t struct_size;
1047
1048 /* Optimization for empty strings */
1049 if (size == 0 && unicode_empty != NULL) {
1050 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001051 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 }
1053
Victor Stinner9e9d6892011-10-04 01:02:02 +02001054 is_ascii = 0;
1055 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 struct_size = sizeof(PyCompactUnicodeObject);
1057 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001058 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059 char_size = 1;
1060 is_ascii = 1;
1061 struct_size = sizeof(PyASCIIObject);
1062 }
1063 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001064 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 char_size = 1;
1066 }
1067 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001068 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 char_size = 2;
1070 if (sizeof(wchar_t) == 2)
1071 is_sharing = 1;
1072 }
1073 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001074 if (maxchar > MAX_UNICODE) {
1075 PyErr_SetString(PyExc_SystemError,
1076 "invalid maximum character passed to PyUnicode_New");
1077 return NULL;
1078 }
Victor Stinner8f825062012-04-27 13:55:39 +02001079 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001080 char_size = 4;
1081 if (sizeof(wchar_t) == 4)
1082 is_sharing = 1;
1083 }
1084
1085 /* Ensure we won't overflow the size. */
1086 if (size < 0) {
1087 PyErr_SetString(PyExc_SystemError,
1088 "Negative size passed to PyUnicode_New");
1089 return NULL;
1090 }
1091 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1092 return PyErr_NoMemory();
1093
1094 /* Duplicated allocation code from _PyObject_New() instead of a call to
1095 * PyObject_New() so we are able to allocate space for the object and
1096 * it's data buffer.
1097 */
1098 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1099 if (obj == NULL)
1100 return PyErr_NoMemory();
1101 obj = PyObject_INIT(obj, &PyUnicode_Type);
1102 if (obj == NULL)
1103 return NULL;
1104
1105 unicode = (PyCompactUnicodeObject *)obj;
1106 if (is_ascii)
1107 data = ((PyASCIIObject*)obj) + 1;
1108 else
1109 data = unicode + 1;
1110 _PyUnicode_LENGTH(unicode) = size;
1111 _PyUnicode_HASH(unicode) = -1;
1112 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001113 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001114 _PyUnicode_STATE(unicode).compact = 1;
1115 _PyUnicode_STATE(unicode).ready = 1;
1116 _PyUnicode_STATE(unicode).ascii = is_ascii;
1117 if (is_ascii) {
1118 ((char*)data)[size] = 0;
1119 _PyUnicode_WSTR(unicode) = NULL;
1120 }
Victor Stinner8f825062012-04-27 13:55:39 +02001121 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001122 ((char*)data)[size] = 0;
1123 _PyUnicode_WSTR(unicode) = NULL;
1124 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001126 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001127 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128 else {
1129 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001130 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001131 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001132 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001133 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 ((Py_UCS4*)data)[size] = 0;
1135 if (is_sharing) {
1136 _PyUnicode_WSTR_LENGTH(unicode) = size;
1137 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1138 }
1139 else {
1140 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1141 _PyUnicode_WSTR(unicode) = NULL;
1142 }
1143 }
Victor Stinner8f825062012-04-27 13:55:39 +02001144#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001145 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001146#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001147 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148 return obj;
1149}
1150
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing the result
   into the 4-byte data of `unicode`.  A high surrogate immediately followed
   by a low surrogate is joined into one code point; every other unit is
   copied unchanged.  The other conversions are implemented as macros for
   efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* The destination must still have room for this code point. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if ((src + 1) < end
            && Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src;
            src += 1;
        }
    }
    /* Exactly the expected number of code points was produced. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1190
Victor Stinnercd9950f2011-10-02 00:34:53 +02001191static int
Victor Stinner488fa492011-12-12 00:01:39 +01001192unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001193{
Victor Stinner488fa492011-12-12 00:01:39 +01001194 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001195 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001196 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001197 return -1;
1198 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001199 return 0;
1200}
1201
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001202static int
1203_copy_characters(PyObject *to, Py_ssize_t to_start,
1204 PyObject *from, Py_ssize_t from_start,
1205 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001206{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001207 unsigned int from_kind, to_kind;
1208 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209
Victor Stinneree4544c2012-05-09 22:24:08 +02001210 assert(0 <= how_many);
1211 assert(0 <= from_start);
1212 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001213 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001214 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001215 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216
Victor Stinnerd3f08822012-05-29 12:57:52 +02001217 assert(PyUnicode_Check(to));
1218 assert(PyUnicode_IS_READY(to));
1219 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1220
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001221 if (how_many == 0)
1222 return 0;
1223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001225 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001226 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001227 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228
Victor Stinnerf1852262012-06-16 16:38:26 +02001229#ifdef Py_DEBUG
1230 if (!check_maxchar
1231 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1232 {
1233 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1234 Py_UCS4 ch;
1235 Py_ssize_t i;
1236 for (i=0; i < how_many; i++) {
1237 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1238 assert(ch <= to_maxchar);
1239 }
1240 }
1241#endif
1242
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001243 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001244 if (check_maxchar
1245 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1246 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001247 /* Writing Latin-1 characters into an ASCII string requires to
1248 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001249 Py_UCS4 max_char;
1250 max_char = ucs1lib_find_max_char(from_data,
1251 (Py_UCS1*)from_data + how_many);
1252 if (max_char >= 128)
1253 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001254 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001255 Py_MEMCPY((char*)to_data + to_kind * to_start,
1256 (char*)from_data + from_kind * from_start,
1257 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001259 else if (from_kind == PyUnicode_1BYTE_KIND
1260 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001261 {
1262 _PyUnicode_CONVERT_BYTES(
1263 Py_UCS1, Py_UCS2,
1264 PyUnicode_1BYTE_DATA(from) + from_start,
1265 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1266 PyUnicode_2BYTE_DATA(to) + to_start
1267 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001268 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001269 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001270 && to_kind == PyUnicode_4BYTE_KIND)
1271 {
1272 _PyUnicode_CONVERT_BYTES(
1273 Py_UCS1, Py_UCS4,
1274 PyUnicode_1BYTE_DATA(from) + from_start,
1275 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1276 PyUnicode_4BYTE_DATA(to) + to_start
1277 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001278 }
1279 else if (from_kind == PyUnicode_2BYTE_KIND
1280 && to_kind == PyUnicode_4BYTE_KIND)
1281 {
1282 _PyUnicode_CONVERT_BYTES(
1283 Py_UCS2, Py_UCS4,
1284 PyUnicode_2BYTE_DATA(from) + from_start,
1285 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1286 PyUnicode_4BYTE_DATA(to) + to_start
1287 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001288 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001289 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001290 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1291
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001292 if (!check_maxchar) {
1293 if (from_kind == PyUnicode_2BYTE_KIND
1294 && to_kind == PyUnicode_1BYTE_KIND)
1295 {
1296 _PyUnicode_CONVERT_BYTES(
1297 Py_UCS2, Py_UCS1,
1298 PyUnicode_2BYTE_DATA(from) + from_start,
1299 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1300 PyUnicode_1BYTE_DATA(to) + to_start
1301 );
1302 }
1303 else if (from_kind == PyUnicode_4BYTE_KIND
1304 && to_kind == PyUnicode_1BYTE_KIND)
1305 {
1306 _PyUnicode_CONVERT_BYTES(
1307 Py_UCS4, Py_UCS1,
1308 PyUnicode_4BYTE_DATA(from) + from_start,
1309 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1310 PyUnicode_1BYTE_DATA(to) + to_start
1311 );
1312 }
1313 else if (from_kind == PyUnicode_4BYTE_KIND
1314 && to_kind == PyUnicode_2BYTE_KIND)
1315 {
1316 _PyUnicode_CONVERT_BYTES(
1317 Py_UCS4, Py_UCS2,
1318 PyUnicode_4BYTE_DATA(from) + from_start,
1319 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1320 PyUnicode_2BYTE_DATA(to) + to_start
1321 );
1322 }
1323 else {
1324 assert(0);
1325 return -1;
1326 }
1327 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001328 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001329 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001330 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001331 Py_ssize_t i;
1332
Victor Stinnera0702ab2011-09-29 14:14:38 +02001333 for (i=0; i < how_many; i++) {
1334 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001335 if (ch > to_maxchar)
1336 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001337 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1338 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001339 }
1340 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001341 return 0;
1342}
1343
Victor Stinnerd3f08822012-05-29 12:57:52 +02001344void
1345_PyUnicode_FastCopyCharacters(
1346 PyObject *to, Py_ssize_t to_start,
1347 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001348{
1349 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1350}
1351
1352Py_ssize_t
1353PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1354 PyObject *from, Py_ssize_t from_start,
1355 Py_ssize_t how_many)
1356{
1357 int err;
1358
1359 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1360 PyErr_BadInternalCall();
1361 return -1;
1362 }
1363
Benjamin Petersonbac79492012-01-14 13:34:47 -05001364 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001365 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001366 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001367 return -1;
1368
Victor Stinnerd3f08822012-05-29 12:57:52 +02001369 if (from_start < 0) {
1370 PyErr_SetString(PyExc_IndexError, "string index out of range");
1371 return -1;
1372 }
1373 if (to_start < 0) {
1374 PyErr_SetString(PyExc_IndexError, "string index out of range");
1375 return -1;
1376 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001377 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1378 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1379 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001380 "Cannot write %zi characters at %zi "
1381 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001382 how_many, to_start, PyUnicode_GET_LENGTH(to));
1383 return -1;
1384 }
1385
1386 if (how_many == 0)
1387 return 0;
1388
Victor Stinner488fa492011-12-12 00:01:39 +01001389 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001390 return -1;
1391
1392 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1393 if (err) {
1394 PyErr_Format(PyExc_SystemError,
1395 "Cannot copy %s characters "
1396 "into a string of %s characters",
1397 unicode_kind_name(from),
1398 unicode_kind_name(to));
1399 return -1;
1400 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001401 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001402}
1403
Victor Stinner17222162011-09-28 22:15:37 +02001404/* Find the maximum code point and count the number of surrogate pairs so a
1405 correct string length can be computed before converting a string to UCS4.
1406 This function counts single surrogates as a character and not as a pair.
1407
1408 Return 0 on success, or -1 on error. */
1409static int
1410find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1411 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412{
1413 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001414 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415
Victor Stinnerc53be962011-10-02 21:33:54 +02001416 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 *num_surrogates = 0;
1418 *maxchar = 0;
1419
1420 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001422 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1423 && (iter+1) < end
1424 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1425 {
1426 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1427 ++(*num_surrogates);
1428 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429 }
1430 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001431#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001432 {
1433 ch = *iter;
1434 iter++;
1435 }
1436 if (ch > *maxchar) {
1437 *maxchar = ch;
1438 if (*maxchar > MAX_UNICODE) {
1439 PyErr_Format(PyExc_ValueError,
1440 "character U+%x is not in range [U+0000; U+10ffff]",
1441 ch);
1442 return -1;
1443 }
1444 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 }
1446 return 0;
1447}
1448
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001449int
1450_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451{
1452 wchar_t *end;
1453 Py_UCS4 maxchar = 0;
1454 Py_ssize_t num_surrogates;
1455#if SIZEOF_WCHAR_T == 2
1456 Py_ssize_t length_wo_surrogates;
1457#endif
1458
Georg Brandl7597add2011-10-05 16:36:47 +02001459 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001460 strings were created using _PyObject_New() and where no canonical
1461 representation (the str field) has been set yet aka strings
1462 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001463 assert(_PyUnicode_CHECK(unicode));
1464 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001466 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001467 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001468 /* Actually, it should neither be interned nor be anything else: */
1469 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001472 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001473 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475
1476 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001477 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1478 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001479 PyErr_NoMemory();
1480 return -1;
1481 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001482 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483 _PyUnicode_WSTR(unicode), end,
1484 PyUnicode_1BYTE_DATA(unicode));
1485 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1486 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1487 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1488 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001489 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001490 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001491 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 }
1493 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001494 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001495 _PyUnicode_UTF8(unicode) = NULL;
1496 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497 }
1498 PyObject_FREE(_PyUnicode_WSTR(unicode));
1499 _PyUnicode_WSTR(unicode) = NULL;
1500 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1501 }
1502 /* In this case we might have to convert down from 4-byte native
1503 wchar_t to 2-byte unicode. */
1504 else if (maxchar < 65536) {
1505 assert(num_surrogates == 0 &&
1506 "FindMaxCharAndNumSurrogatePairs() messed up");
1507
Victor Stinner506f5922011-09-28 22:34:18 +02001508#if SIZEOF_WCHAR_T == 2
1509 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001510 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001511 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1512 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1513 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001514 _PyUnicode_UTF8(unicode) = NULL;
1515 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001516#else
1517 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001518 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001519 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001520 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001521 PyErr_NoMemory();
1522 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001523 }
Victor Stinner506f5922011-09-28 22:34:18 +02001524 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1525 _PyUnicode_WSTR(unicode), end,
1526 PyUnicode_2BYTE_DATA(unicode));
1527 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1528 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1529 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001530 _PyUnicode_UTF8(unicode) = NULL;
1531 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001532 PyObject_FREE(_PyUnicode_WSTR(unicode));
1533 _PyUnicode_WSTR(unicode) = NULL;
1534 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1535#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001536 }
1537 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1538 else {
1539#if SIZEOF_WCHAR_T == 2
1540 /* in case the native representation is 2-bytes, we need to allocate a
1541 new normalized 4-byte version. */
1542 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001543 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1544 PyErr_NoMemory();
1545 return -1;
1546 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001547 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1548 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001549 PyErr_NoMemory();
1550 return -1;
1551 }
1552 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1553 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001554 _PyUnicode_UTF8(unicode) = NULL;
1555 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001556 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1557 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001558 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559 PyObject_FREE(_PyUnicode_WSTR(unicode));
1560 _PyUnicode_WSTR(unicode) = NULL;
1561 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1562#else
1563 assert(num_surrogates == 0);
1564
Victor Stinnerc3c74152011-10-02 20:39:55 +02001565 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001567 _PyUnicode_UTF8(unicode) = NULL;
1568 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1570#endif
1571 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1572 }
1573 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001574 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001575 return 0;
1576}
1577
Alexander Belopolsky40018472011-02-26 01:02:56 +00001578static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001579unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001580{
Walter Dörwald16807132007-05-25 13:52:07 +00001581 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 case SSTATE_NOT_INTERNED:
1583 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001584
Benjamin Peterson29060642009-01-31 22:14:21 +00001585 case SSTATE_INTERNED_MORTAL:
1586 /* revive dead object temporarily for DelItem */
1587 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001588 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001589 Py_FatalError(
1590 "deletion of interned string failed");
1591 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001592
Benjamin Peterson29060642009-01-31 22:14:21 +00001593 case SSTATE_INTERNED_IMMORTAL:
1594 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001595
Benjamin Peterson29060642009-01-31 22:14:21 +00001596 default:
1597 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001598 }
1599
Victor Stinner03490912011-10-03 23:45:12 +02001600 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001601 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001602 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001603 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001604 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1605 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001606
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001607 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608}
1609
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or a cached one-character string from the
   unicode_latin1 table), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical kind can be a cached
       Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return ch < 256 && unicode_latin1[ch] == unicode;
}
#endif
1626
Alexander Belopolsky40018472011-02-26 01:02:56 +00001627static int
Victor Stinner488fa492011-12-12 00:01:39 +01001628unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001629{
Victor Stinner488fa492011-12-12 00:01:39 +01001630 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631 if (Py_REFCNT(unicode) != 1)
1632 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001633 if (_PyUnicode_HASH(unicode) != -1)
1634 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001635 if (PyUnicode_CHECK_INTERNED(unicode))
1636 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001637 if (!PyUnicode_CheckExact(unicode))
1638 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001639#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001640 /* singleton refcount is greater than 1 */
1641 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001642#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001643 return 1;
1644}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001645
Victor Stinnerfe226c02011-10-03 03:52:20 +02001646static int
1647unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1648{
1649 PyObject *unicode;
1650 Py_ssize_t old_length;
1651
1652 assert(p_unicode != NULL);
1653 unicode = *p_unicode;
1654
1655 assert(unicode != NULL);
1656 assert(PyUnicode_Check(unicode));
1657 assert(0 <= length);
1658
Victor Stinner910337b2011-10-03 03:20:16 +02001659 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001660 old_length = PyUnicode_WSTR_LENGTH(unicode);
1661 else
1662 old_length = PyUnicode_GET_LENGTH(unicode);
1663 if (old_length == length)
1664 return 0;
1665
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001666 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001667 _Py_INCREF_UNICODE_EMPTY();
1668 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001669 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001670 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001671 return 0;
1672 }
1673
Victor Stinner488fa492011-12-12 00:01:39 +01001674 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001675 PyObject *copy = resize_copy(unicode, length);
1676 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001677 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001678 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001679 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001680 }
1681
Victor Stinnerfe226c02011-10-03 03:52:20 +02001682 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001683 PyObject *new_unicode = resize_compact(unicode, length);
1684 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001685 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001686 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001687 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001688 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001689 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001690}
1691
Alexander Belopolsky40018472011-02-26 01:02:56 +00001692int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001693PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001694{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001695 PyObject *unicode;
1696 if (p_unicode == NULL) {
1697 PyErr_BadInternalCall();
1698 return -1;
1699 }
1700 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001701 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001702 {
1703 PyErr_BadInternalCall();
1704 return -1;
1705 }
1706 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001707}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001708
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001709/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001710
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001711 WARNING: The function doesn't copy the terminating null character and
1712 doesn't check the maximum character (may write a latin1 character in an
1713 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001714static void
1715unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1716 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001717{
1718 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1719 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001720 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001721
1722 switch (kind) {
1723 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001724 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001725#ifdef Py_DEBUG
1726 if (PyUnicode_IS_ASCII(unicode)) {
1727 Py_UCS4 maxchar = ucs1lib_find_max_char(
1728 (const Py_UCS1*)str,
1729 (const Py_UCS1*)str + len);
1730 assert(maxchar < 128);
1731 }
1732#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001733 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001734 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001735 }
1736 case PyUnicode_2BYTE_KIND: {
1737 Py_UCS2 *start = (Py_UCS2 *)data + index;
1738 Py_UCS2 *ucs2 = start;
1739 assert(index <= PyUnicode_GET_LENGTH(unicode));
1740
Victor Stinner184252a2012-06-16 02:57:41 +02001741 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001742 *ucs2 = (Py_UCS2)*str;
1743
1744 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001745 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001746 }
1747 default: {
1748 Py_UCS4 *start = (Py_UCS4 *)data + index;
1749 Py_UCS4 *ucs4 = start;
1750 assert(kind == PyUnicode_4BYTE_KIND);
1751 assert(index <= PyUnicode_GET_LENGTH(unicode));
1752
Victor Stinner184252a2012-06-16 02:57:41 +02001753 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001754 *ucs4 = (Py_UCS4)*str;
1755
1756 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001757 }
1758 }
1759}
1760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761static PyObject*
1762get_latin1_char(unsigned char ch)
1763{
Victor Stinnera464fc12011-10-02 20:39:30 +02001764 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001766 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767 if (!unicode)
1768 return NULL;
1769 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001770 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 unicode_latin1[ch] = unicode;
1772 }
1773 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001774 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775}
1776
Victor Stinner985a82a2014-01-03 12:53:47 +01001777static PyObject*
1778unicode_char(Py_UCS4 ch)
1779{
1780 PyObject *unicode;
1781
1782 assert(ch <= MAX_UNICODE);
1783
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001784 if (ch < 256)
1785 return get_latin1_char(ch);
1786
Victor Stinner985a82a2014-01-03 12:53:47 +01001787 unicode = PyUnicode_New(1, ch);
1788 if (unicode == NULL)
1789 return NULL;
1790 switch (PyUnicode_KIND(unicode)) {
1791 case PyUnicode_1BYTE_KIND:
1792 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1793 break;
1794 case PyUnicode_2BYTE_KIND:
1795 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1796 break;
1797 default:
1798 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1799 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1800 }
1801 assert(_PyUnicode_CheckConsistency(unicode, 1));
1802 return unicode;
1803}
1804
Alexander Belopolsky40018472011-02-26 01:02:56 +00001805PyObject *
1806PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001808 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 Py_UCS4 maxchar = 0;
1810 Py_ssize_t num_surrogates;
1811
1812 if (u == NULL)
1813 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001815 /* If the Unicode data is known at construction time, we can apply
1816 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001819 if (size == 0)
1820 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 /* Single character Unicode objects in the Latin-1 range are
1823 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001824 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 return get_latin1_char((unsigned char)*u);
1826
1827 /* If not empty and not single character, copy the Unicode data
1828 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001829 if (find_maxchar_surrogates(u, u + size,
1830 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 return NULL;
1832
Victor Stinner8faf8212011-12-08 22:14:11 +01001833 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001834 if (!unicode)
1835 return NULL;
1836
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001837 switch (PyUnicode_KIND(unicode)) {
1838 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001839 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1841 break;
1842 case PyUnicode_2BYTE_KIND:
1843#if Py_UNICODE_SIZE == 2
1844 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1845#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001846 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001847 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1848#endif
1849 break;
1850 case PyUnicode_4BYTE_KIND:
1851#if SIZEOF_WCHAR_T == 2
1852 /* This is the only case which has to process surrogates, thus
1853 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001854 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855#else
1856 assert(num_surrogates == 0);
1857 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1858#endif
1859 break;
1860 default:
1861 assert(0 && "Impossible state");
1862 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001864 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865}
1866
Alexander Belopolsky40018472011-02-26 01:02:56 +00001867PyObject *
1868PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001869{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001870 if (size < 0) {
1871 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001872 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001873 return NULL;
1874 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001875 if (u != NULL)
1876 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1877 else
1878 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001879}
1880
Alexander Belopolsky40018472011-02-26 01:02:56 +00001881PyObject *
1882PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001883{
1884 size_t size = strlen(u);
1885 if (size > PY_SSIZE_T_MAX) {
1886 PyErr_SetString(PyExc_OverflowError, "input too long");
1887 return NULL;
1888 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001889 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001890}
1891
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001892PyObject *
1893_PyUnicode_FromId(_Py_Identifier *id)
1894{
1895 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001896 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1897 strlen(id->string),
1898 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001899 if (!id->object)
1900 return NULL;
1901 PyUnicode_InternInPlace(&id->object);
1902 assert(!id->next);
1903 id->next = static_strings;
1904 static_strings = id;
1905 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001906 return id->object;
1907}
1908
1909void
1910_PyUnicode_ClearStaticStrings()
1911{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001912 _Py_Identifier *tmp, *s = static_strings;
1913 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001914 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001915 tmp = s->next;
1916 s->next = NULL;
1917 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001918 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001919 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001920}
1921
Benjamin Peterson0df54292012-03-26 14:50:32 -04001922/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001923
Victor Stinnerd3f08822012-05-29 12:57:52 +02001924PyObject*
1925_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001926{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001927 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001928 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001929 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001930#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001931 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001932#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001933 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001934 }
Victor Stinner785938e2011-12-11 20:09:03 +01001935 unicode = PyUnicode_New(size, 127);
1936 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001937 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001938 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1939 assert(_PyUnicode_CheckConsistency(unicode, 1));
1940 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001941}
1942
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001943static Py_UCS4
1944kind_maxchar_limit(unsigned int kind)
1945{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001946 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001947 case PyUnicode_1BYTE_KIND:
1948 return 0x80;
1949 case PyUnicode_2BYTE_KIND:
1950 return 0x100;
1951 case PyUnicode_4BYTE_KIND:
1952 return 0x10000;
1953 default:
1954 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001955 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001956 }
1957}
1958
Victor Stinnere6abb482012-05-02 01:15:40 +02001959Py_LOCAL_INLINE(Py_UCS4)
1960align_maxchar(Py_UCS4 maxchar)
1961{
1962 if (maxchar <= 127)
1963 return 127;
1964 else if (maxchar <= 255)
1965 return 255;
1966 else if (maxchar <= 65535)
1967 return 65535;
1968 else
1969 return MAX_UNICODE;
1970}
1971
Victor Stinner702c7342011-10-05 13:50:52 +02001972static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001973_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001974{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001975 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001976 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001977
Serhiy Storchaka678db842013-01-26 12:16:36 +02001978 if (size == 0)
1979 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001980 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001981 if (size == 1)
1982 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001983
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001984 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001985 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986 if (!res)
1987 return NULL;
1988 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001989 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001991}
1992
Victor Stinnere57b1c02011-09-28 22:20:48 +02001993static PyObject*
1994_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995{
1996 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001997 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001998
Serhiy Storchaka678db842013-01-26 12:16:36 +02001999 if (size == 0)
2000 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002001 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002002 if (size == 1)
2003 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002004
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002005 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002006 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 if (!res)
2008 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002009 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002011 else {
2012 _PyUnicode_CONVERT_BYTES(
2013 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2014 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002015 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 return res;
2017}
2018
Victor Stinnere57b1c02011-09-28 22:20:48 +02002019static PyObject*
2020_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021{
2022 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002023 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002024
Serhiy Storchaka678db842013-01-26 12:16:36 +02002025 if (size == 0)
2026 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002027 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002028 if (size == 1)
2029 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002030
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002031 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002032 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 if (!res)
2034 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002035 if (max_char < 256)
2036 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2037 PyUnicode_1BYTE_DATA(res));
2038 else if (max_char < 0x10000)
2039 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2040 PyUnicode_2BYTE_DATA(res));
2041 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002043 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 return res;
2045}
2046
2047PyObject*
2048PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2049{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002050 if (size < 0) {
2051 PyErr_SetString(PyExc_ValueError, "size must be positive");
2052 return NULL;
2053 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002054 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002055 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002056 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002058 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002060 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002061 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002062 PyErr_SetString(PyExc_SystemError, "invalid kind");
2063 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002064 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002065}
2066
Victor Stinnerece58de2012-04-23 23:36:38 +02002067Py_UCS4
2068_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2069{
2070 enum PyUnicode_Kind kind;
2071 void *startptr, *endptr;
2072
2073 assert(PyUnicode_IS_READY(unicode));
2074 assert(0 <= start);
2075 assert(end <= PyUnicode_GET_LENGTH(unicode));
2076 assert(start <= end);
2077
2078 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2079 return PyUnicode_MAX_CHAR_VALUE(unicode);
2080
2081 if (start == end)
2082 return 127;
2083
Victor Stinner94d558b2012-04-27 22:26:58 +02002084 if (PyUnicode_IS_ASCII(unicode))
2085 return 127;
2086
Victor Stinnerece58de2012-04-23 23:36:38 +02002087 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002088 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002089 endptr = (char *)startptr + end * kind;
2090 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002091 switch(kind) {
2092 case PyUnicode_1BYTE_KIND:
2093 return ucs1lib_find_max_char(startptr, endptr);
2094 case PyUnicode_2BYTE_KIND:
2095 return ucs2lib_find_max_char(startptr, endptr);
2096 case PyUnicode_4BYTE_KIND:
2097 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002098 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002099 assert(0);
2100 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002101 }
2102}
2103
Victor Stinner25a4b292011-10-06 12:31:55 +02002104/* Ensure that a string uses the most efficient storage, if it is not the
2105 case: create a new string with of the right kind. Write NULL into *p_unicode
2106 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002107static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002108unicode_adjust_maxchar(PyObject **p_unicode)
2109{
2110 PyObject *unicode, *copy;
2111 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002112 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002113 unsigned int kind;
2114
2115 assert(p_unicode != NULL);
2116 unicode = *p_unicode;
2117 assert(PyUnicode_IS_READY(unicode));
2118 if (PyUnicode_IS_ASCII(unicode))
2119 return;
2120
2121 len = PyUnicode_GET_LENGTH(unicode);
2122 kind = PyUnicode_KIND(unicode);
2123 if (kind == PyUnicode_1BYTE_KIND) {
2124 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002125 max_char = ucs1lib_find_max_char(u, u + len);
2126 if (max_char >= 128)
2127 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002128 }
2129 else if (kind == PyUnicode_2BYTE_KIND) {
2130 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002131 max_char = ucs2lib_find_max_char(u, u + len);
2132 if (max_char >= 256)
2133 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002134 }
2135 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002136 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002137 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002138 max_char = ucs4lib_find_max_char(u, u + len);
2139 if (max_char >= 0x10000)
2140 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002141 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002142 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002143 if (copy != NULL)
2144 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002145 Py_DECREF(unicode);
2146 *p_unicode = copy;
2147}
2148
Victor Stinner034f6cf2011-09-30 02:26:44 +02002149PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002150_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002151{
Victor Stinner87af4f22011-11-21 23:03:47 +01002152 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002153 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002154
Victor Stinner034f6cf2011-09-30 02:26:44 +02002155 if (!PyUnicode_Check(unicode)) {
2156 PyErr_BadInternalCall();
2157 return NULL;
2158 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002159 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002160 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002161
Victor Stinner87af4f22011-11-21 23:03:47 +01002162 length = PyUnicode_GET_LENGTH(unicode);
2163 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002164 if (!copy)
2165 return NULL;
2166 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2167
Victor Stinner87af4f22011-11-21 23:03:47 +01002168 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2169 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002170 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002171 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002172}
2173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002174
Victor Stinnerbc603d12011-10-02 01:00:40 +02002175/* Widen Unicode objects to larger buffers. Don't write terminating null
2176 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177
2178void*
2179_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2180{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002181 Py_ssize_t len;
2182 void *result;
2183 unsigned int skind;
2184
Benjamin Petersonbac79492012-01-14 13:34:47 -05002185 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002186 return NULL;
2187
2188 len = PyUnicode_GET_LENGTH(s);
2189 skind = PyUnicode_KIND(s);
2190 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002191 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002192 return NULL;
2193 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002194 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002195 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002196 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002197 if (!result)
2198 return PyErr_NoMemory();
2199 assert(skind == PyUnicode_1BYTE_KIND);
2200 _PyUnicode_CONVERT_BYTES(
2201 Py_UCS1, Py_UCS2,
2202 PyUnicode_1BYTE_DATA(s),
2203 PyUnicode_1BYTE_DATA(s) + len,
2204 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002206 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002207 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002208 if (!result)
2209 return PyErr_NoMemory();
2210 if (skind == PyUnicode_2BYTE_KIND) {
2211 _PyUnicode_CONVERT_BYTES(
2212 Py_UCS2, Py_UCS4,
2213 PyUnicode_2BYTE_DATA(s),
2214 PyUnicode_2BYTE_DATA(s) + len,
2215 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002217 else {
2218 assert(skind == PyUnicode_1BYTE_KIND);
2219 _PyUnicode_CONVERT_BYTES(
2220 Py_UCS1, Py_UCS4,
2221 PyUnicode_1BYTE_DATA(s),
2222 PyUnicode_1BYTE_DATA(s) + len,
2223 result);
2224 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002226 default:
2227 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228 }
Victor Stinner01698042011-10-04 00:04:26 +02002229 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002230 return NULL;
2231}
2232
2233static Py_UCS4*
2234as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2235 int copy_null)
2236{
2237 int kind;
2238 void *data;
2239 Py_ssize_t len, targetlen;
2240 if (PyUnicode_READY(string) == -1)
2241 return NULL;
2242 kind = PyUnicode_KIND(string);
2243 data = PyUnicode_DATA(string);
2244 len = PyUnicode_GET_LENGTH(string);
2245 targetlen = len;
2246 if (copy_null)
2247 targetlen++;
2248 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002249 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002250 if (!target) {
2251 PyErr_NoMemory();
2252 return NULL;
2253 }
2254 }
2255 else {
2256 if (targetsize < targetlen) {
2257 PyErr_Format(PyExc_SystemError,
2258 "string is longer than the buffer");
2259 if (copy_null && 0 < targetsize)
2260 target[0] = 0;
2261 return NULL;
2262 }
2263 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002264 if (kind == PyUnicode_1BYTE_KIND) {
2265 Py_UCS1 *start = (Py_UCS1 *) data;
2266 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002268 else if (kind == PyUnicode_2BYTE_KIND) {
2269 Py_UCS2 *start = (Py_UCS2 *) data;
2270 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2271 }
2272 else {
2273 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002275 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 if (copy_null)
2277 target[len] = 0;
2278 return target;
2279}
2280
2281Py_UCS4*
2282PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2283 int copy_null)
2284{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002285 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 PyErr_BadInternalCall();
2287 return NULL;
2288 }
2289 return as_ucs4(string, target, targetsize, copy_null);
2290}
2291
2292Py_UCS4*
2293PyUnicode_AsUCS4Copy(PyObject *string)
2294{
2295 return as_ucs4(string, NULL, 0, 1);
2296}
2297
2298#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002299
Alexander Belopolsky40018472011-02-26 01:02:56 +00002300PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002301PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002305 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002306 PyErr_BadInternalCall();
2307 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308 }
2309
Martin v. Löwis790465f2008-04-05 20:41:37 +00002310 if (size == -1) {
2311 size = wcslen(w);
2312 }
2313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002314 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002315}
2316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002317#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002318
/* Maximum number of characters required for the output of %lld or %p.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign; 53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002323
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002324static int
2325unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2326 Py_ssize_t width, Py_ssize_t precision)
2327{
2328 Py_ssize_t length, fill, arglen;
2329 Py_UCS4 maxchar;
2330
2331 if (PyUnicode_READY(str) == -1)
2332 return -1;
2333
2334 length = PyUnicode_GET_LENGTH(str);
2335 if ((precision == -1 || precision >= length)
2336 && width <= length)
2337 return _PyUnicodeWriter_WriteStr(writer, str);
2338
2339 if (precision != -1)
2340 length = Py_MIN(precision, length);
2341
2342 arglen = Py_MAX(length, width);
2343 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2344 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2345 else
2346 maxchar = writer->maxchar;
2347
2348 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2349 return -1;
2350
2351 if (width > length) {
2352 fill = width - length;
2353 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2354 return -1;
2355 writer->pos += fill;
2356 }
2357
2358 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2359 str, 0, length);
2360 writer->pos += length;
2361 return 0;
2362}
2363
2364static int
2365unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2366 Py_ssize_t width, Py_ssize_t precision)
2367{
2368 /* UTF-8 */
2369 Py_ssize_t length;
2370 PyObject *unicode;
2371 int res;
2372
2373 length = strlen(str);
2374 if (precision != -1)
2375 length = Py_MIN(length, precision);
2376 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2377 if (unicode == NULL)
2378 return -1;
2379
2380 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2381 Py_DECREF(unicode);
2382 return res;
2383}
2384
Victor Stinner96865452011-03-01 23:44:09 +00002385static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002386unicode_fromformat_arg(_PyUnicodeWriter *writer,
2387 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002388{
Victor Stinnere215d962012-10-06 23:03:36 +02002389 const char *p;
2390 Py_ssize_t len;
2391 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002392 Py_ssize_t width;
2393 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002394 int longflag;
2395 int longlongflag;
2396 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002397 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002398
2399 p = f;
2400 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002401 zeropad = 0;
2402 if (*f == '0') {
2403 zeropad = 1;
2404 f++;
2405 }
Victor Stinner96865452011-03-01 23:44:09 +00002406
2407 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002408 width = -1;
2409 if (Py_ISDIGIT((unsigned)*f)) {
2410 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002411 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002412 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002413 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002414 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002415 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002416 return NULL;
2417 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002418 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002419 f++;
2420 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002421 }
2422 precision = -1;
2423 if (*f == '.') {
2424 f++;
2425 if (Py_ISDIGIT((unsigned)*f)) {
2426 precision = (*f - '0');
2427 f++;
2428 while (Py_ISDIGIT((unsigned)*f)) {
2429 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2430 PyErr_SetString(PyExc_ValueError,
2431 "precision too big");
2432 return NULL;
2433 }
2434 precision = (precision * 10) + (*f - '0');
2435 f++;
2436 }
2437 }
Victor Stinner96865452011-03-01 23:44:09 +00002438 if (*f == '%') {
2439 /* "%.3%s" => f points to "3" */
2440 f--;
2441 }
2442 }
2443 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002444 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002445 f--;
2446 }
Victor Stinner96865452011-03-01 23:44:09 +00002447
2448 /* Handle %ld, %lu, %lld and %llu. */
2449 longflag = 0;
2450 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002451 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002452 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002453 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002454 longflag = 1;
2455 ++f;
2456 }
2457#ifdef HAVE_LONG_LONG
2458 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002459 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002460 longlongflag = 1;
2461 f += 2;
2462 }
2463#endif
2464 }
2465 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002466 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002467 size_tflag = 1;
2468 ++f;
2469 }
Victor Stinnere215d962012-10-06 23:03:36 +02002470
2471 if (f[1] == '\0')
2472 writer->overallocate = 0;
2473
2474 switch (*f) {
2475 case 'c':
2476 {
2477 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002478 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002479 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002480 "character argument not in range(0x110000)");
2481 return NULL;
2482 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002483 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002484 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002485 break;
2486 }
2487
2488 case 'i':
2489 case 'd':
2490 case 'u':
2491 case 'x':
2492 {
2493 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002494 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002495 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002496
2497 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002498 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002499 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002500 va_arg(*vargs, unsigned long));
2501#ifdef HAVE_LONG_LONG
2502 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002503 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002504 va_arg(*vargs, unsigned PY_LONG_LONG));
2505#endif
2506 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002507 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002508 va_arg(*vargs, size_t));
2509 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002510 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002511 va_arg(*vargs, unsigned int));
2512 }
2513 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002514 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002515 }
2516 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002517 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002518 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002519 va_arg(*vargs, long));
2520#ifdef HAVE_LONG_LONG
2521 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002522 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002523 va_arg(*vargs, PY_LONG_LONG));
2524#endif
2525 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002526 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002527 va_arg(*vargs, Py_ssize_t));
2528 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002529 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002530 va_arg(*vargs, int));
2531 }
2532 assert(len >= 0);
2533
Victor Stinnere215d962012-10-06 23:03:36 +02002534 if (precision < len)
2535 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002536
2537 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002538 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2539 return NULL;
2540
Victor Stinnere215d962012-10-06 23:03:36 +02002541 if (width > precision) {
2542 Py_UCS4 fillchar;
2543 fill = width - precision;
2544 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002545 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2546 return NULL;
2547 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002548 }
Victor Stinner15a11362012-10-06 23:48:20 +02002549 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002550 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002551 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2552 return NULL;
2553 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002554 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002555
Victor Stinner4a587072013-11-19 12:54:53 +01002556 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2557 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002558 break;
2559 }
2560
2561 case 'p':
2562 {
2563 char number[MAX_LONG_LONG_CHARS];
2564
2565 len = sprintf(number, "%p", va_arg(*vargs, void*));
2566 assert(len >= 0);
2567
2568 /* %p is ill-defined: ensure leading 0x. */
2569 if (number[1] == 'X')
2570 number[1] = 'x';
2571 else if (number[1] != 'x') {
2572 memmove(number + 2, number,
2573 strlen(number) + 1);
2574 number[0] = '0';
2575 number[1] = 'x';
2576 len += 2;
2577 }
2578
Victor Stinner4a587072013-11-19 12:54:53 +01002579 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002580 return NULL;
2581 break;
2582 }
2583
2584 case 's':
2585 {
2586 /* UTF-8 */
2587 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002588 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002589 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002590 break;
2591 }
2592
2593 case 'U':
2594 {
2595 PyObject *obj = va_arg(*vargs, PyObject *);
2596 assert(obj && _PyUnicode_CHECK(obj));
2597
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002598 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002599 return NULL;
2600 break;
2601 }
2602
2603 case 'V':
2604 {
2605 PyObject *obj = va_arg(*vargs, PyObject *);
2606 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002607 if (obj) {
2608 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002609 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002610 return NULL;
2611 }
2612 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002613 assert(str != NULL);
2614 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002615 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002616 }
2617 break;
2618 }
2619
2620 case 'S':
2621 {
2622 PyObject *obj = va_arg(*vargs, PyObject *);
2623 PyObject *str;
2624 assert(obj);
2625 str = PyObject_Str(obj);
2626 if (!str)
2627 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002628 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002629 Py_DECREF(str);
2630 return NULL;
2631 }
2632 Py_DECREF(str);
2633 break;
2634 }
2635
2636 case 'R':
2637 {
2638 PyObject *obj = va_arg(*vargs, PyObject *);
2639 PyObject *repr;
2640 assert(obj);
2641 repr = PyObject_Repr(obj);
2642 if (!repr)
2643 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002644 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002645 Py_DECREF(repr);
2646 return NULL;
2647 }
2648 Py_DECREF(repr);
2649 break;
2650 }
2651
2652 case 'A':
2653 {
2654 PyObject *obj = va_arg(*vargs, PyObject *);
2655 PyObject *ascii;
2656 assert(obj);
2657 ascii = PyObject_ASCII(obj);
2658 if (!ascii)
2659 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002660 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002661 Py_DECREF(ascii);
2662 return NULL;
2663 }
2664 Py_DECREF(ascii);
2665 break;
2666 }
2667
2668 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002669 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002670 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002671 break;
2672
2673 default:
2674 /* if we stumble upon an unknown formatting code, copy the rest
2675 of the format string to the output string. (we cannot just
2676 skip the code, since there's no way to know what's in the
2677 argument list) */
2678 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002679 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002680 return NULL;
2681 f = p+len;
2682 return f;
2683 }
2684
2685 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002686 return f;
2687}
2688
Walter Dörwaldd2034312007-05-18 16:29:38 +00002689PyObject *
2690PyUnicode_FromFormatV(const char *format, va_list vargs)
2691{
Victor Stinnere215d962012-10-06 23:03:36 +02002692 va_list vargs2;
2693 const char *f;
2694 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002695
Victor Stinner8f674cc2013-04-17 23:02:17 +02002696 _PyUnicodeWriter_Init(&writer);
2697 writer.min_length = strlen(format) + 100;
2698 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002699
2700 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2701 Copy it to be able to pass a reference to a subfunction. */
2702 Py_VA_COPY(vargs2, vargs);
2703
2704 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002705 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002706 f = unicode_fromformat_arg(&writer, f, &vargs2);
2707 if (f == NULL)
2708 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002709 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002710 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002711 const char *p;
2712 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002713
Victor Stinnere215d962012-10-06 23:03:36 +02002714 p = f;
2715 do
2716 {
2717 if ((unsigned char)*p > 127) {
2718 PyErr_Format(PyExc_ValueError,
2719 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2720 "string, got a non-ASCII byte: 0x%02x",
2721 (unsigned char)*p);
2722 return NULL;
2723 }
2724 p++;
2725 }
2726 while (*p != '\0' && *p != '%');
2727 len = p - f;
2728
2729 if (*p == '\0')
2730 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002731
2732 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002733 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002734
2735 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002736 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002737 }
Victor Stinnere215d962012-10-06 23:03:36 +02002738 return _PyUnicodeWriter_Finish(&writer);
2739
2740 fail:
2741 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002742 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002743}
2744
Walter Dörwaldd2034312007-05-18 16:29:38 +00002745PyObject *
2746PyUnicode_FromFormat(const char *format, ...)
2747{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002748 PyObject* ret;
2749 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002750
2751#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002752 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002753#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002754 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002755#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002756 ret = PyUnicode_FromFormatV(format, vargs);
2757 va_end(vargs);
2758 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002759}
2760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002761#ifdef HAVE_WCHAR_H
2762
Victor Stinner5593d8a2010-10-02 11:11:27 +00002763/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2764 convert a Unicode object to a wide character string.
2765
Victor Stinnerd88d9832011-09-06 02:00:05 +02002766 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002767 character) required to convert the unicode object. Ignore size argument.
2768
Victor Stinnerd88d9832011-09-06 02:00:05 +02002769 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002770 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002771 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002772static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002773unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002774 wchar_t *w,
2775 Py_ssize_t size)
2776{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002777 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002778 const wchar_t *wstr;
2779
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002780 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002781 if (wstr == NULL)
2782 return -1;
2783
Victor Stinner5593d8a2010-10-02 11:11:27 +00002784 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002785 if (size > res)
2786 size = res + 1;
2787 else
2788 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002789 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002790 return res;
2791 }
2792 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002794}
2795
2796Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002797PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002798 wchar_t *w,
2799 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800{
2801 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002802 PyErr_BadInternalCall();
2803 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002805 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806}
2807
Victor Stinner137c34c2010-09-29 10:25:54 +00002808wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002809PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002810 Py_ssize_t *size)
2811{
2812 wchar_t* buffer;
2813 Py_ssize_t buflen;
2814
2815 if (unicode == NULL) {
2816 PyErr_BadInternalCall();
2817 return NULL;
2818 }
2819
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002820 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002821 if (buflen == -1)
2822 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002823 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00002824 if (buffer == NULL) {
2825 PyErr_NoMemory();
2826 return NULL;
2827 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002828 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002829 if (buflen == -1) {
2830 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002831 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002832 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002833 if (size != NULL)
2834 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002835 return buffer;
2836}
2837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002838#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839
Alexander Belopolsky40018472011-02-26 01:02:56 +00002840PyObject *
2841PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002842{
Victor Stinner8faf8212011-12-08 22:14:11 +01002843 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002844 PyErr_SetString(PyExc_ValueError,
2845 "chr() arg not in range(0x110000)");
2846 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002847 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002848
Victor Stinner985a82a2014-01-03 12:53:47 +01002849 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002850}
2851
Alexander Belopolsky40018472011-02-26 01:02:56 +00002852PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002853PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002855 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002856 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002857 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002858 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002859 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002860 Py_INCREF(obj);
2861 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002862 }
2863 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002864 /* For a Unicode subtype that's not a Unicode object,
2865 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002866 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002867 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002868 PyErr_Format(PyExc_TypeError,
2869 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002870 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002871 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002872}
2873
Alexander Belopolsky40018472011-02-26 01:02:56 +00002874PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002875PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002876 const char *encoding,
2877 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002878{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002879 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002880 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002881
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002883 PyErr_BadInternalCall();
2884 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002886
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002887 /* Decoding bytes objects is the most common case and should be fast */
2888 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002889 if (PyBytes_GET_SIZE(obj) == 0)
2890 _Py_RETURN_UNICODE_EMPTY();
2891 v = PyUnicode_Decode(
2892 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2893 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002894 return v;
2895 }
2896
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002897 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002898 PyErr_SetString(PyExc_TypeError,
2899 "decoding str is not supported");
2900 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002901 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002902
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002903 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2904 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2905 PyErr_Format(PyExc_TypeError,
Serhiy Storchakab757c832014-12-05 22:25:22 +02002906 "coercing to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002907 Py_TYPE(obj)->tp_name);
2908 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002909 }
Tim Petersced69f82003-09-16 20:30:58 +00002910
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002911 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002912 PyBuffer_Release(&buffer);
2913 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002914 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002915
Serhiy Storchaka05997252013-01-26 12:14:02 +02002916 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002917 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002918 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919}
2920
/* Normalize an encoding name into the buffer lower (of lower_len bytes):
   upper-case letters are converted to lower case and '_' is replaced by '-',
   so that e.g. "UTF_8" becomes "utf-8".  A NULL encoding is normalized to
   "utf-8".

   Return 1 on success (lower is null-terminated).  Return 0 if the result,
   including its terminating null byte, does not fit into lower_len bytes;
   the content of lower is then unspecified. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* A zero-sized buffer cannot even hold the terminator.  Without this
       check, lower_len - 1 below would wrap around (size_t is unsigned) and
       the loop would write outside the buffer. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot: it is reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2960
Alexander Belopolsky40018472011-02-26 01:02:56 +00002961PyObject *
2962PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002963 Py_ssize_t size,
2964 const char *encoding,
2965 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002966{
2967 PyObject *buffer = NULL, *unicode;
2968 Py_buffer info;
2969 char lower[11]; /* Enough for any encoding shortcut */
2970
Fred Drakee4315f52000-05-09 19:53:39 +00002971 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002972 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002973 if ((strcmp(lower, "utf-8") == 0) ||
2974 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002975 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002976 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002977 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01002978 (strcmp(lower, "iso-8859-1") == 0) ||
2979 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002980 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002981#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002982 else if (strcmp(lower, "mbcs") == 0)
2983 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002984#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002985 else if (strcmp(lower, "ascii") == 0)
2986 return PyUnicode_DecodeASCII(s, size, errors);
2987 else if (strcmp(lower, "utf-16") == 0)
2988 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2989 else if (strcmp(lower, "utf-32") == 0)
2990 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992
2993 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002994 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002995 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002996 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002997 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 if (buffer == NULL)
2999 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003000 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001 if (unicode == NULL)
3002 goto onError;
3003 if (!PyUnicode_Check(unicode)) {
3004 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003005 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3006 "use codecs.decode() to decode to arbitrary types",
3007 encoding,
3008 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003009 Py_DECREF(unicode);
3010 goto onError;
3011 }
3012 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003013 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003014
Benjamin Peterson29060642009-01-31 22:14:21 +00003015 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016 Py_XDECREF(buffer);
3017 return NULL;
3018}
3019
Alexander Belopolsky40018472011-02-26 01:02:56 +00003020PyObject *
3021PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003022 const char *encoding,
3023 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003024{
3025 PyObject *v;
3026
3027 if (!PyUnicode_Check(unicode)) {
3028 PyErr_BadArgument();
3029 goto onError;
3030 }
3031
3032 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003033 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003034
3035 /* Decode via the codec registry */
3036 v = PyCodec_Decode(unicode, encoding, errors);
3037 if (v == NULL)
3038 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003039 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003040
Benjamin Peterson29060642009-01-31 22:14:21 +00003041 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003042 return NULL;
3043}
3044
Alexander Belopolsky40018472011-02-26 01:02:56 +00003045PyObject *
3046PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003047 const char *encoding,
3048 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003049{
3050 PyObject *v;
3051
3052 if (!PyUnicode_Check(unicode)) {
3053 PyErr_BadArgument();
3054 goto onError;
3055 }
3056
3057 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003058 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003059
3060 /* Decode via the codec registry */
3061 v = PyCodec_Decode(unicode, encoding, errors);
3062 if (v == NULL)
3063 goto onError;
3064 if (!PyUnicode_Check(v)) {
3065 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003066 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3067 "use codecs.decode() to decode to arbitrary types",
3068 encoding,
3069 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003070 Py_DECREF(v);
3071 goto onError;
3072 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003073 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003074
Benjamin Peterson29060642009-01-31 22:14:21 +00003075 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003076 return NULL;
3077}
3078
Alexander Belopolsky40018472011-02-26 01:02:56 +00003079PyObject *
3080PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003081 Py_ssize_t size,
3082 const char *encoding,
3083 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084{
3085 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003086
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 unicode = PyUnicode_FromUnicode(s, size);
3088 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003089 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3091 Py_DECREF(unicode);
3092 return v;
3093}
3094
Alexander Belopolsky40018472011-02-26 01:02:56 +00003095PyObject *
3096PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003097 const char *encoding,
3098 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003099{
3100 PyObject *v;
3101
3102 if (!PyUnicode_Check(unicode)) {
3103 PyErr_BadArgument();
3104 goto onError;
3105 }
3106
3107 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003108 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003109
3110 /* Encode via the codec registry */
3111 v = PyCodec_Encode(unicode, encoding, errors);
3112 if (v == NULL)
3113 goto onError;
3114 return v;
3115
Benjamin Peterson29060642009-01-31 22:14:21 +00003116 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003117 return NULL;
3118}
3119
/* Find the index in wstr of the first character that wcstombs() refuses to
   encode.  The string is converted one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  Return 0 if no failing
   character is found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* surrogate pair + terminator */
#else
    wchar_t unit[2];            /* one character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;
    const wchar_t *unit_start;
    size_t rc;

    /* The last slot always holds the terminator. */
#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    while (*cursor != L'\0') {
        unit_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep the pair together. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        rc = wcstombs(scratch, unit, sizeof(scratch));
        if (rc == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3166
Victor Stinner1b579672011-12-17 05:47:23 +01003167static int
3168locale_error_handler(const char *errors, int *surrogateescape)
3169{
3170 if (errors == NULL) {
3171 *surrogateescape = 0;
3172 return 0;
3173 }
3174
3175 if (strcmp(errors, "strict") == 0) {
3176 *surrogateescape = 0;
3177 return 0;
3178 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003179 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003180 *surrogateescape = 1;
3181 return 0;
3182 }
3183 PyErr_Format(PyExc_ValueError,
3184 "only 'strict' and 'surrogateescape' error handlers "
3185 "are supported, not '%s'",
3186 errors);
3187 return -1;
3188}
3189
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003190PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003191PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003192{
3193 Py_ssize_t wlen, wlen2;
3194 wchar_t *wstr;
3195 PyObject *bytes = NULL;
3196 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003197 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003198 PyObject *exc;
3199 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003200 int surrogateescape;
3201
3202 if (locale_error_handler(errors, &surrogateescape) < 0)
3203 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003204
3205 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3206 if (wstr == NULL)
3207 return NULL;
3208
3209 wlen2 = wcslen(wstr);
3210 if (wlen2 != wlen) {
3211 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003212 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003213 return NULL;
3214 }
3215
3216 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003217 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003218 char *str;
3219
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003220 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003221 if (str == NULL) {
3222 if (error_pos == (size_t)-1) {
3223 PyErr_NoMemory();
3224 PyMem_Free(wstr);
3225 return NULL;
3226 }
3227 else {
3228 goto encode_error;
3229 }
3230 }
3231 PyMem_Free(wstr);
3232
3233 bytes = PyBytes_FromString(str);
3234 PyMem_Free(str);
3235 }
3236 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003237 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003238 size_t len, len2;
3239
3240 len = wcstombs(NULL, wstr, 0);
3241 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003242 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003243 goto encode_error;
3244 }
3245
3246 bytes = PyBytes_FromStringAndSize(NULL, len);
3247 if (bytes == NULL) {
3248 PyMem_Free(wstr);
3249 return NULL;
3250 }
3251
3252 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3253 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003254 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003255 goto encode_error;
3256 }
3257 PyMem_Free(wstr);
3258 }
3259 return bytes;
3260
3261encode_error:
3262 errmsg = strerror(errno);
3263 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003264
3265 if (error_pos == (size_t)-1)
3266 error_pos = wcstombs_errorpos(wstr);
3267
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003268 PyMem_Free(wstr);
3269 Py_XDECREF(bytes);
3270
Victor Stinner2f197072011-12-17 07:08:30 +01003271 if (errmsg != NULL) {
3272 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003273 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003274 if (wstr != NULL) {
3275 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003276 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003277 } else
3278 errmsg = NULL;
3279 }
3280 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003281 reason = PyUnicode_FromString(
3282 "wcstombs() encountered an unencodable "
3283 "wide character");
3284 if (reason == NULL)
3285 return NULL;
3286
3287 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3288 "locale", unicode,
3289 (Py_ssize_t)error_pos,
3290 (Py_ssize_t)(error_pos+1),
3291 reason);
3292 Py_DECREF(reason);
3293 if (exc != NULL) {
3294 PyCodec_StrictErrors(exc);
3295 Py_XDECREF(exc);
3296 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003297 return NULL;
3298}
3299
Victor Stinnerad158722010-10-27 00:25:46 +00003300PyObject *
3301PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003302{
Victor Stinner99b95382011-07-04 14:23:54 +02003303#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003304 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003305#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003306 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003307#else
Victor Stinner793b5312011-04-27 00:24:21 +02003308 PyInterpreterState *interp = PyThreadState_GET()->interp;
3309 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3310 cannot use it to encode and decode filenames before it is loaded. Load
3311 the Python codec requires to encode at least its own filename. Use the C
3312 version of the locale codec until the codec registry is initialized and
3313 the Python codec is loaded.
3314
3315 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3316 cannot only rely on it: check also interp->fscodec_initialized for
3317 subinterpreters. */
3318 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003319 return PyUnicode_AsEncodedString(unicode,
3320 Py_FileSystemDefaultEncoding,
3321 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003322 }
3323 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003324 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003325 }
Victor Stinnerad158722010-10-27 00:25:46 +00003326#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003327}
3328
Alexander Belopolsky40018472011-02-26 01:02:56 +00003329PyObject *
3330PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003331 const char *encoding,
3332 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003333{
3334 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003335 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003336
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 if (!PyUnicode_Check(unicode)) {
3338 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003339 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340 }
Fred Drakee4315f52000-05-09 19:53:39 +00003341
Fred Drakee4315f52000-05-09 19:53:39 +00003342 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003343 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003344 if ((strcmp(lower, "utf-8") == 0) ||
3345 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003346 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003347 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003348 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003349 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003350 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003351 }
Victor Stinner37296e82010-06-10 13:36:23 +00003352 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003353 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003354 (strcmp(lower, "iso-8859-1") == 0) ||
3355 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003356 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003357#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003358 else if (strcmp(lower, "mbcs") == 0)
3359 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003360#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003361 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003362 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003363 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364
3365 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003366 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003367 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003368 return NULL;
3369
3370 /* The normal path */
3371 if (PyBytes_Check(v))
3372 return v;
3373
3374 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003375 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003376 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003377 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003378
3379 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003380 "encoder %s returned bytearray instead of bytes; "
3381 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003382 encoding);
3383 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003384 Py_DECREF(v);
3385 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003386 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003387
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003388 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3389 Py_DECREF(v);
3390 return b;
3391 }
3392
3393 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003394 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3395 "use codecs.encode() to encode to arbitrary types",
3396 encoding,
3397 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003398 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003399 return NULL;
3400}
3401
Alexander Belopolsky40018472011-02-26 01:02:56 +00003402PyObject *
3403PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003404 const char *encoding,
3405 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003406{
3407 PyObject *v;
3408
3409 if (!PyUnicode_Check(unicode)) {
3410 PyErr_BadArgument();
3411 goto onError;
3412 }
3413
3414 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003415 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003416
3417 /* Encode via the codec registry */
3418 v = PyCodec_Encode(unicode, encoding, errors);
3419 if (v == NULL)
3420 goto onError;
3421 if (!PyUnicode_Check(v)) {
3422 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003423 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3424 "use codecs.encode() to encode to arbitrary types",
3425 encoding,
3426 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003427 Py_DECREF(v);
3428 goto onError;
3429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003430 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003431
Benjamin Peterson29060642009-01-31 22:14:21 +00003432 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003433 return NULL;
3434}
3435
/* Find the offset in str (len bytes long) of the first byte sequence that
   mbrtowc() reports as invalid or incomplete.  Return 0 if no such sequence
   is found, or if the build has no mbrtowc() (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    size_t consumed;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining)
    {
        consumed = mbrtowc(&wc, cursor, remaining, &state);
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
#endif
    /* failed to find the undecodable byte sequence */
    return 0;
}
3466
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003467PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003468PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003469 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003470{
3471 wchar_t smallbuf[256];
3472 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3473 wchar_t *wstr;
3474 size_t wlen, wlen2;
3475 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003476 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003477 size_t error_pos;
3478 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003479 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3480 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003481
3482 if (locale_error_handler(errors, &surrogateescape) < 0)
3483 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003484
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003485 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3486 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003487 return NULL;
3488 }
3489
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003490 if (surrogateescape) {
3491 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003492 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003493 if (wstr == NULL) {
3494 if (wlen == (size_t)-1)
3495 PyErr_NoMemory();
3496 else
3497 PyErr_SetFromErrno(PyExc_OSError);
3498 return NULL;
3499 }
3500
3501 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003502 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003503 }
3504 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003505 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003506#ifndef HAVE_BROKEN_MBSTOWCS
3507 wlen = mbstowcs(NULL, str, 0);
3508#else
3509 wlen = len;
3510#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003511 if (wlen == (size_t)-1)
3512 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003513 if (wlen+1 <= smallbuf_len) {
3514 wstr = smallbuf;
3515 }
3516 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003517 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003518 if (!wstr)
3519 return PyErr_NoMemory();
3520 }
3521
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003522 wlen2 = mbstowcs(wstr, str, wlen+1);
3523 if (wlen2 == (size_t)-1) {
3524 if (wstr != smallbuf)
3525 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003526 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003527 }
3528#ifdef HAVE_BROKEN_MBSTOWCS
3529 assert(wlen2 == wlen);
3530#endif
3531 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3532 if (wstr != smallbuf)
3533 PyMem_Free(wstr);
3534 }
3535 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003536
3537decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003538 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003539 errmsg = strerror(errno);
3540 assert(errmsg != NULL);
3541
3542 error_pos = mbstowcs_errorpos(str, len);
3543 if (errmsg != NULL) {
3544 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003545 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003546 if (wstr != NULL) {
3547 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003548 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003549 }
Victor Stinner2f197072011-12-17 07:08:30 +01003550 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003551 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003552 reason = PyUnicode_FromString(
3553 "mbstowcs() encountered an invalid multibyte sequence");
3554 if (reason == NULL)
3555 return NULL;
3556
3557 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3558 "locale", str, len,
3559 (Py_ssize_t)error_pos,
3560 (Py_ssize_t)(error_pos+1),
3561 reason);
3562 Py_DECREF(reason);
3563 if (exc != NULL) {
3564 PyCodec_StrictErrors(exc);
3565 Py_XDECREF(exc);
3566 }
3567 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003568}
3569
3570PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003571PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003572{
3573 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003574 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003575}
3576
3577
3578PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003579PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003580 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003581 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3582}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003583
Christian Heimes5894ba72007-11-04 11:43:14 +00003584PyObject*
3585PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3586{
Victor Stinner99b95382011-07-04 14:23:54 +02003587#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003588 return PyUnicode_DecodeMBCS(s, size, NULL);
3589#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003590 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003591#else
Victor Stinner793b5312011-04-27 00:24:21 +02003592 PyInterpreterState *interp = PyThreadState_GET()->interp;
3593 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3594 cannot use it to encode and decode filenames before it is loaded. Load
3595 the Python codec requires to encode at least its own filename. Use the C
3596 version of the locale codec until the codec registry is initialized and
3597 the Python codec is loaded.
3598
3599 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3600 cannot only rely on it: check also interp->fscodec_initialized for
3601 subinterpreters. */
3602 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003603 return PyUnicode_Decode(s, size,
3604 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003605 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003606 }
3607 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003608 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003609 }
Victor Stinnerad158722010-10-27 00:25:46 +00003610#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003611}
3612
Martin v. Löwis011e8422009-05-05 04:43:17 +00003613
3614int
3615PyUnicode_FSConverter(PyObject* arg, void* addr)
3616{
3617 PyObject *output = NULL;
3618 Py_ssize_t size;
3619 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003620 if (arg == NULL) {
3621 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003622 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003623 return 1;
3624 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003625 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003626 output = arg;
3627 Py_INCREF(output);
3628 }
3629 else {
3630 arg = PyUnicode_FromObject(arg);
3631 if (!arg)
3632 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003633 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003634 Py_DECREF(arg);
3635 if (!output)
3636 return 0;
3637 if (!PyBytes_Check(output)) {
3638 Py_DECREF(output);
3639 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3640 return 0;
3641 }
3642 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003643 size = PyBytes_GET_SIZE(output);
3644 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003645 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003646 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003647 Py_DECREF(output);
3648 return 0;
3649 }
3650 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003651 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003652}
3653
3654
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003655int
3656PyUnicode_FSDecoder(PyObject* arg, void* addr)
3657{
3658 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003659 if (arg == NULL) {
3660 Py_DECREF(*(PyObject**)addr);
3661 return 1;
3662 }
3663 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003664 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003665 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003666 output = arg;
3667 Py_INCREF(output);
3668 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003669 else if (PyObject_CheckBuffer(arg)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003670 arg = PyBytes_FromObject(arg);
3671 if (!arg)
3672 return 0;
3673 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3674 PyBytes_GET_SIZE(arg));
3675 Py_DECREF(arg);
3676 if (!output)
3677 return 0;
3678 if (!PyUnicode_Check(output)) {
3679 Py_DECREF(output);
3680 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3681 return 0;
3682 }
3683 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003684 else {
3685 PyErr_Format(PyExc_TypeError,
3686 "path should be string or bytes, not %.200s",
3687 Py_TYPE(arg)->tp_name);
3688 return 0;
3689 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003690 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003691 Py_DECREF(output);
3692 return 0;
3693 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003694 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003695 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003696 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003697 Py_DECREF(output);
3698 return 0;
3699 }
3700 *(PyObject**)addr = output;
3701 return Py_CLEANUP_SUPPORTED;
3702}
3703
3704
Martin v. Löwis5b222132007-06-10 09:51:05 +00003705char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003706PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003707{
Christian Heimesf3863112007-11-22 07:46:41 +00003708 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003709
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003710 if (!PyUnicode_Check(unicode)) {
3711 PyErr_BadArgument();
3712 return NULL;
3713 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003714 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003715 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003716
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003717 if (PyUnicode_UTF8(unicode) == NULL) {
3718 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003719 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3720 if (bytes == NULL)
3721 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003722 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3723 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003724 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003725 Py_DECREF(bytes);
3726 return NULL;
3727 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003728 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3729 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3730 PyBytes_AS_STRING(bytes),
3731 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003732 Py_DECREF(bytes);
3733 }
3734
3735 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003736 *psize = PyUnicode_UTF8_LENGTH(unicode);
3737 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003738}
3739
3740char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003741PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003742{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003743 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3744}
3745
/* Return the wchar_t (Py_UNICODE) representation stored in UNICODE's wstr
   slot, building and caching it first if the slot is still NULL.

   When SIZE is not NULL it receives PyUnicode_WSTR_LENGTH(unicode).
   Returns NULL with an exception set if UNICODE is not a str object or if
   memory allocation fails.  The returned pointer is the object's own cached
   buffer. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
    /* Only the variables needed for the conversions that can actually occur
       with this wchar_t width are declared. */
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        /* NOTE(review): the 1-byte branch below also special-cases compact
           ASCII objects, so the comment above may be narrower than the
           code -- confirm. */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* First pass: count the code points above 0xFFFF; each one
               needs two wchar_t units (a surrogate pair). */
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* One unit per code point, one extra per surrogate pair, plus
               the terminating NUL. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass: copy, splitting non-BMP code points. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* Debug-build sanity check on the size computed above. */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* Guard the size computation below against overflow. */
            if ((size_t)_PyUnicode_LENGTH(unicode) >
                    PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is stored only for non-compact-ASCII objects.
               NOTE(review): presumably compact ASCII objects have no
               separate wstr_length field -- confirm against the struct
               definitions. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                /* Widen each byte to one wchar_t. */
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* Widen each 2-byte unit to one wchar_t. */
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Release the buffer just allocated before reporting the
                   fatal error. */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
3862
Alexander Belopolsky40018472011-02-26 01:02:56 +00003863Py_UNICODE *
3864PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003865{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003867}
3868
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003869
Alexander Belopolsky40018472011-02-26 01:02:56 +00003870Py_ssize_t
3871PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872{
3873 if (!PyUnicode_Check(unicode)) {
3874 PyErr_BadArgument();
3875 goto onError;
3876 }
3877 return PyUnicode_GET_SIZE(unicode);
3878
Benjamin Peterson29060642009-01-31 22:14:21 +00003879 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003880 return -1;
3881}
3882
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003883Py_ssize_t
3884PyUnicode_GetLength(PyObject *unicode)
3885{
Victor Stinner07621332012-06-16 04:53:46 +02003886 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003887 PyErr_BadArgument();
3888 return -1;
3889 }
Victor Stinner07621332012-06-16 04:53:46 +02003890 if (PyUnicode_READY(unicode) == -1)
3891 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003892 return PyUnicode_GET_LENGTH(unicode);
3893}
3894
3895Py_UCS4
3896PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3897{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003898 void *data;
3899 int kind;
3900
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003901 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3902 PyErr_BadArgument();
3903 return (Py_UCS4)-1;
3904 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003905 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003906 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003907 return (Py_UCS4)-1;
3908 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003909 data = PyUnicode_DATA(unicode);
3910 kind = PyUnicode_KIND(unicode);
3911 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003912}
3913
3914int
3915PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3916{
3917 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003918 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003919 return -1;
3920 }
Victor Stinner488fa492011-12-12 00:01:39 +01003921 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003922 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003923 PyErr_SetString(PyExc_IndexError, "string index out of range");
3924 return -1;
3925 }
Victor Stinner488fa492011-12-12 00:01:39 +01003926 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003927 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003928 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3929 PyErr_SetString(PyExc_ValueError, "character out of range");
3930 return -1;
3931 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003932 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3933 index, ch);
3934 return 0;
3935}
3936
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3942
Victor Stinner554f3f02010-06-16 23:33:54 +00003943/* create or adjust a UnicodeDecodeError */
3944static void
3945make_decode_exception(PyObject **exceptionObject,
3946 const char *encoding,
3947 const char *input, Py_ssize_t length,
3948 Py_ssize_t startpos, Py_ssize_t endpos,
3949 const char *reason)
3950{
3951 if (*exceptionObject == NULL) {
3952 *exceptionObject = PyUnicodeDecodeError_Create(
3953 encoding, input, length, startpos, endpos, reason);
3954 }
3955 else {
3956 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3957 goto onError;
3958 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3959 goto onError;
3960 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3961 goto onError;
3962 }
3963 return;
3964
3965onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02003966 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00003967}
3968
#ifdef HAVE_MBCS
/* Error handling callback helper (wchar_t output variant):
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   *errorHandler is looked up from ERRORS on first use and cached.
   *input / *inend are refreshed from the exception's object attribute,
   which the callback may have replaced.  *output is resized when the
   replacement plus the remaining input would not fit.  On success
   *outpos is advanced past the replacement, and *endinpos and *inptr are
   set to the position returned by the callback.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* Format for PyArg_ParseTuple(); the text after ';' (offset 4) is also
       used on its own as the TypeError message. */
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Do not go on to read a non-bytes object as bytes. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow at least geometrically when that cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4084
4085static int
4086unicode_decode_call_errorhandler_writer(
4087 const char *errors, PyObject **errorHandler,
4088 const char *encoding, const char *reason,
4089 const char **input, const char **inend, Py_ssize_t *startinpos,
4090 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4091 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4092{
4093 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4094
4095 PyObject *restuple = NULL;
4096 PyObject *repunicode = NULL;
4097 Py_ssize_t insize;
4098 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004099 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004100 PyObject *inputobj = NULL;
4101
4102 if (*errorHandler == NULL) {
4103 *errorHandler = PyCodec_LookupError(errors);
4104 if (*errorHandler == NULL)
4105 goto onError;
4106 }
4107
4108 make_decode_exception(exceptionObject,
4109 encoding,
4110 *input, *inend - *input,
4111 *startinpos, *endinpos,
4112 reason);
4113 if (*exceptionObject == NULL)
4114 goto onError;
4115
4116 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4117 if (restuple == NULL)
4118 goto onError;
4119 if (!PyTuple_Check(restuple)) {
4120 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4121 goto onError;
4122 }
4123 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004124 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004125
4126 /* Copy back the bytes variables, which might have been modified by the
4127 callback */
4128 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4129 if (!inputobj)
4130 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004131 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004133 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004134 *input = PyBytes_AS_STRING(inputobj);
4135 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004136 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004137 /* we can DECREF safely, as the exception has another reference,
4138 so the object won't go away. */
4139 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004140
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004142 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004143 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004144 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004145 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004146 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004147
Victor Stinner8f674cc2013-04-17 23:02:17 +02004148 if (PyUnicode_READY(repunicode) < 0)
4149 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004150 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004151 if (replen > 1) {
4152 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004153 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004154 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4155 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4156 goto onError;
4157 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004158 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004159 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004160
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004162 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004163
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004164 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004165 Py_XDECREF(restuple);
4166 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004167
Benjamin Peterson29060642009-01-31 22:14:21 +00004168 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004169 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004170 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004171}
4172
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* Read-only lookup table, indexed by ASCII code (0..127). */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The flag arguments are parenthesized in the expansion so that an
 * expression such as `a || b` can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004253
Alexander Belopolsky40018472011-02-26 01:02:56 +00004254PyObject *
4255PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004256 Py_ssize_t size,
4257 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004258{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004259 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4260}
4261
Antoine Pitrou244651a2009-05-04 18:56:13 +00004262/* The decoder. The only state we preserve is our read position,
4263 * i.e. how many characters we have consumed. So if we end in the
4264 * middle of a shift sequence we have to back off the read position
4265 * and the output to the beginning of the sequence, otherwise we lose
4266 * all the shift state (seen bits, number of bits seen, high
4267 * surrogate). */
4268
Alexander Belopolsky40018472011-02-26 01:02:56 +00004269PyObject *
4270PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004271 Py_ssize_t size,
4272 const char *errors,
4273 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004274{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004275 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004276 Py_ssize_t startinpos;
4277 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004278 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004279 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004280 const char *errmsg = "";
4281 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004282 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004283 unsigned int base64bits = 0;
4284 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004285 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286 PyObject *errorHandler = NULL;
4287 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004288
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004289 if (size == 0) {
4290 if (consumed)
4291 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004292 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004293 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004294
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004295 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004296 _PyUnicodeWriter_Init(&writer);
4297 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004298
4299 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004300 e = s + size;
4301
4302 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004303 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004304 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004305 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004306
Antoine Pitrou244651a2009-05-04 18:56:13 +00004307 if (inShift) { /* in a base-64 section */
4308 if (IS_BASE64(ch)) { /* consume a base-64 character */
4309 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4310 base64bits += 6;
4311 s++;
4312 if (base64bits >= 16) {
4313 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004314 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004315 base64bits -= 16;
4316 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004317 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004318 if (surrogate) {
4319 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004320 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4321 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004322 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004323 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004324 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004325 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004326 }
4327 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004328 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004329 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004330 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004331 }
4332 }
Victor Stinner551ac952011-11-29 22:58:13 +01004333 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004334 /* first surrogate */
4335 surrogate = outCh;
4336 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004337 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004338 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004339 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004340 }
4341 }
4342 }
4343 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004344 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004345 if (base64bits > 0) { /* left-over bits */
4346 if (base64bits >= 6) {
4347 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004348 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004349 errmsg = "partial character in shift sequence";
4350 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004351 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352 else {
4353 /* Some bits remain; they should be zero */
4354 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004355 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004356 errmsg = "non-zero padding bits in shift sequence";
4357 goto utf7Error;
4358 }
4359 }
4360 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004361 if (surrogate && DECODE_DIRECT(ch)) {
4362 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4363 goto onError;
4364 }
4365 surrogate = 0;
4366 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004367 /* '-' is absorbed; other terminating
4368 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004369 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004370 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004371 }
4372 }
4373 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004374 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004375 s++; /* consume '+' */
4376 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004377 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004378 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004379 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004380 }
4381 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004382 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004383 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004384 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004386 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004387 }
4388 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004389 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004390 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004391 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004392 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004393 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004394 else {
4395 startinpos = s-starts;
4396 s++;
4397 errmsg = "unexpected special character";
4398 goto utf7Error;
4399 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004400 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004401utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004402 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004403 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004404 errors, &errorHandler,
4405 "utf7", errmsg,
4406 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004407 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004408 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004409 }
4410
Antoine Pitrou244651a2009-05-04 18:56:13 +00004411 /* end of string */
4412
4413 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4414 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004415 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004416 if (surrogate ||
4417 (base64bits >= 6) ||
4418 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004419 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004420 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 errors, &errorHandler,
4422 "utf7", "unterminated shift sequence",
4423 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004424 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004425 goto onError;
4426 if (s < e)
4427 goto restart;
4428 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004429 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004430
4431 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004432 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004434 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004435 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004436 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004437 writer.kind, writer.data, shiftOutStart);
4438 Py_XDECREF(errorHandler);
4439 Py_XDECREF(exc);
4440 _PyUnicodeWriter_Dealloc(&writer);
4441 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004442 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004443 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004444 }
4445 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004446 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004448 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004449
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004450 Py_XDECREF(errorHandler);
4451 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004452 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004453
Benjamin Peterson29060642009-01-31 22:14:21 +00004454 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 Py_XDECREF(errorHandler);
4456 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004457 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004458 return NULL;
4459}
4460
4461
Alexander Belopolsky40018472011-02-26 01:02:56 +00004462PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004463_PyUnicode_EncodeUTF7(PyObject *str,
4464 int base64SetO,
4465 int base64WhiteSpace,
4466 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004467{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004468 int kind;
4469 void *data;
4470 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004471 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004473 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004474 unsigned int base64bits = 0;
4475 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004476 char * out;
4477 char * start;
4478
Benjamin Petersonbac79492012-01-14 13:34:47 -05004479 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004480 return NULL;
4481 kind = PyUnicode_KIND(str);
4482 data = PyUnicode_DATA(str);
4483 len = PyUnicode_GET_LENGTH(str);
4484
4485 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004486 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004487
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004488 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004489 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004490 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004491 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492 if (v == NULL)
4493 return NULL;
4494
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004495 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004496 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004497 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498
Antoine Pitrou244651a2009-05-04 18:56:13 +00004499 if (inShift) {
4500 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4501 /* shifting out */
4502 if (base64bits) { /* output remaining bits */
4503 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4504 base64buffer = 0;
4505 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004506 }
4507 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004508 /* Characters not in the BASE64 set implicitly unshift the sequence
4509 so no '-' is required, except if the character is itself a '-' */
4510 if (IS_BASE64(ch) || ch == '-') {
4511 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004512 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004513 *out++ = (char) ch;
4514 }
4515 else {
4516 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004517 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004518 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004519 else { /* not in a shift sequence */
4520 if (ch == '+') {
4521 *out++ = '+';
4522 *out++ = '-';
4523 }
4524 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4525 *out++ = (char) ch;
4526 }
4527 else {
4528 *out++ = '+';
4529 inShift = 1;
4530 goto encode_char;
4531 }
4532 }
4533 continue;
4534encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004535 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004536 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004537
Antoine Pitrou244651a2009-05-04 18:56:13 +00004538 /* code first surrogate */
4539 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004540 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004541 while (base64bits >= 6) {
4542 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4543 base64bits -= 6;
4544 }
4545 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004546 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004547 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004548 base64bits += 16;
4549 base64buffer = (base64buffer << 16) | ch;
4550 while (base64bits >= 6) {
4551 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4552 base64bits -= 6;
4553 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004554 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004555 if (base64bits)
4556 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4557 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004558 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004559 if (_PyBytes_Resize(&v, out - start) < 0)
4560 return NULL;
4561 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004562}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004563PyObject *
4564PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4565 Py_ssize_t size,
4566 int base64SetO,
4567 int base64WhiteSpace,
4568 const char *errors)
4569{
4570 PyObject *result;
4571 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4572 if (tmp == NULL)
4573 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004574 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004575 base64WhiteSpace, errors);
4576 Py_DECREF(tmp);
4577 return result;
4578}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580#undef IS_BASE64
4581#undef FROM_BASE64
4582#undef TO_BASE64
4583#undef DECODE_DIRECT
4584#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004585
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586/* --- UTF-8 Codec -------------------------------------------------------- */
4587
Alexander Belopolsky40018472011-02-26 01:02:56 +00004588PyObject *
4589PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004590 Py_ssize_t size,
4591 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592{
Walter Dörwald69652032004-09-07 20:24:22 +00004593 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4594}
4595
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004596#include "stringlib/asciilib.h"
4597#include "stringlib/codecs.h"
4598#include "stringlib/undef.h"
4599
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004600#include "stringlib/ucs1lib.h"
4601#include "stringlib/codecs.h"
4602#include "stringlib/undef.h"
4603
4604#include "stringlib/ucs2lib.h"
4605#include "stringlib/codecs.h"
4606#include "stringlib/undef.h"
4607
4608#include "stringlib/ucs4lib.h"
4609#include "stringlib/codecs.h"
4610#include "stringlib/undef.h"
4611
Antoine Pitrouab868312009-01-10 15:40:25 +00004612/* Mask to quickly check whether a C 'long' contains a
4613 non-ASCII, UTF8-encoded char. */
4614#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004615# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004616#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004617# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004618#else
4619# error C 'long' size should be either 4 or 8!
4620#endif
4621
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004622static Py_ssize_t
4623ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004624{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004625 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004626 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004627
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004628 /*
4629 * Issue #17237: m68k is a bit different from most architectures in
4630 * that objects do not use "natural alignment" - for example, int and
4631 * long are only aligned at 2-byte boundaries. Therefore the assert()
4632 * won't work; also, tests have shown that skipping the "optimised
4633 * version" will even speed up m68k.
4634 */
4635#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004636#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004637 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4638 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004639 /* Fast path, see in STRINGLIB(utf8_decode) for
4640 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004641 /* Help allocation */
4642 const char *_p = p;
4643 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004644 while (_p < aligned_end) {
4645 unsigned long value = *(const unsigned long *) _p;
4646 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004647 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004648 *((unsigned long *)q) = value;
4649 _p += SIZEOF_LONG;
4650 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004651 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004652 p = _p;
4653 while (p < end) {
4654 if ((unsigned char)*p & 0x80)
4655 break;
4656 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004658 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004659 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004660#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004661#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004662 while (p < end) {
4663 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4664 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004665 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004666 /* Help allocation */
4667 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004668 while (_p < aligned_end) {
4669 unsigned long value = *(unsigned long *) _p;
4670 if (value & ASCII_CHAR_MASK)
4671 break;
4672 _p += SIZEOF_LONG;
4673 }
4674 p = _p;
4675 if (_p == end)
4676 break;
4677 }
4678 if ((unsigned char)*p & 0x80)
4679 break;
4680 ++p;
4681 }
4682 memcpy(dest, start, p - start);
4683 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684}
Antoine Pitrouab868312009-01-10 15:40:25 +00004685
Victor Stinner785938e2011-12-11 20:09:03 +01004686PyObject *
4687PyUnicode_DecodeUTF8Stateful(const char *s,
4688 Py_ssize_t size,
4689 const char *errors,
4690 Py_ssize_t *consumed)
4691{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004692 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004693 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004694 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004695
4696 Py_ssize_t startinpos;
4697 Py_ssize_t endinpos;
4698 const char *errmsg = "";
4699 PyObject *errorHandler = NULL;
4700 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004701
4702 if (size == 0) {
4703 if (consumed)
4704 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004705 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004706 }
4707
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004708 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4709 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004710 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004711 *consumed = 1;
4712 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004713 }
4714
Victor Stinner8f674cc2013-04-17 23:02:17 +02004715 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004716 writer.min_length = size;
4717 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004718 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004719
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004720 writer.pos = ascii_decode(s, end, writer.data);
4721 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004722 while (s < end) {
4723 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004724 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004725 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004726 if (PyUnicode_IS_ASCII(writer.buffer))
4727 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004728 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004729 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004730 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004731 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004732 } else {
4733 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004734 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004735 }
4736
4737 switch (ch) {
4738 case 0:
4739 if (s == end || consumed)
4740 goto End;
4741 errmsg = "unexpected end of data";
4742 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004743 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004744 break;
4745 case 1:
4746 errmsg = "invalid start byte";
4747 startinpos = s - starts;
4748 endinpos = startinpos + 1;
4749 break;
4750 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004751 case 3:
4752 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004753 errmsg = "invalid continuation byte";
4754 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004755 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004756 break;
4757 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004758 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004759 goto onError;
4760 continue;
4761 }
4762
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004763 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004764 errors, &errorHandler,
4765 "utf-8", errmsg,
4766 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004767 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004768 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004769 }
4770
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004771End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004772 if (consumed)
4773 *consumed = s - starts;
4774
4775 Py_XDECREF(errorHandler);
4776 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004777 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004778
4779onError:
4780 Py_XDECREF(errorHandler);
4781 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004782 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004783 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004784}
4785
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s, size: the UTF-8 encoded input bytes.

   Bytes that cannot be decoded are each mapped to 0xDC00 + byte value.
   With a 16-bit wchar_t, characters above 0xFFFF are stored as a
   surrogate pair.

   Return a pointer to a newly allocated, null-terminated wide character
   string (use PyMem_RawFree() to free the memory), or NULL on memory
   allocation error or if size is too large for the buffer size to be
   computed. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    /* Same test as "MAX / sizeof(wchar_t) < size + 1", written so that
       size + 1 is never computed for size == PY_SSIZE_T_MAX. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* not expected with a 32-bit output buffer */
            assert(0);
#else
            /* A character handed back by the 16-bit decoder is one that
               needs a surrogate pair, i.e. a code point above the BMP;
               it is not itself a surrogate. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* small values are decoder status codes; 0 with the input
               exhausted means we are done */
            if (!ch && s == e)
                break;
            /* surrogateescape: map the offending byte to U+DC00 + byte
               and skip over it */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004842/* Primary internal function which creates utf8 encoded bytes objects.
4843
4844 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004845 and allocate exactly as much space needed at the end. Else allocate the
4846 maximum possible needed (4 result bytes per Unicode character), and return
4847 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004848*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004849PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004850_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851{
Victor Stinner6099a032011-12-18 14:22:26 +01004852 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004853 void *data;
4854 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004856 if (!PyUnicode_Check(unicode)) {
4857 PyErr_BadArgument();
4858 return NULL;
4859 }
4860
4861 if (PyUnicode_READY(unicode) == -1)
4862 return NULL;
4863
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004864 if (PyUnicode_UTF8(unicode))
4865 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4866 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004867
4868 kind = PyUnicode_KIND(unicode);
4869 data = PyUnicode_DATA(unicode);
4870 size = PyUnicode_GET_LENGTH(unicode);
4871
Benjamin Petersonead6b532011-12-20 17:23:42 -06004872 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004873 default:
4874 assert(0);
4875 case PyUnicode_1BYTE_KIND:
4876 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4877 assert(!PyUnicode_IS_ASCII(unicode));
4878 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4879 case PyUnicode_2BYTE_KIND:
4880 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4881 case PyUnicode_4BYTE_KIND:
4882 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004883 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884}
4885
Alexander Belopolsky40018472011-02-26 01:02:56 +00004886PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004887PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4888 Py_ssize_t size,
4889 const char *errors)
4890{
4891 PyObject *v, *unicode;
4892
4893 unicode = PyUnicode_FromUnicode(s, size);
4894 if (unicode == NULL)
4895 return NULL;
4896 v = _PyUnicode_AsUTF8String(unicode, errors);
4897 Py_DECREF(unicode);
4898 return v;
4899}
4900
4901PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004902PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004904 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905}
4906
Walter Dörwald41980ca2007-08-16 21:55:45 +00004907/* --- UTF-32 Codec ------------------------------------------------------- */
4908
4909PyObject *
4910PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004911 Py_ssize_t size,
4912 const char *errors,
4913 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004914{
4915 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4916}
4917
4918PyObject *
4919PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004920 Py_ssize_t size,
4921 const char *errors,
4922 int *byteorder,
4923 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004924{
4925 const char *starts = s;
4926 Py_ssize_t startinpos;
4927 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004928 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004929 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004930 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004931 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004932 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004933 PyObject *errorHandler = NULL;
4934 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004935
Walter Dörwald41980ca2007-08-16 21:55:45 +00004936 q = (unsigned char *)s;
4937 e = q + size;
4938
4939 if (byteorder)
4940 bo = *byteorder;
4941
4942 /* Check for BOM marks (U+FEFF) in the input and adjust current
4943 byte order setting accordingly. In native mode, the leading BOM
4944 mark is skipped, in all other modes, it is copied to the output
4945 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004946 if (bo == 0 && size >= 4) {
4947 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4948 if (bom == 0x0000FEFF) {
4949 bo = -1;
4950 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004951 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004952 else if (bom == 0xFFFE0000) {
4953 bo = 1;
4954 q += 4;
4955 }
4956 if (byteorder)
4957 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004958 }
4959
Victor Stinnere64322e2012-10-30 23:12:47 +01004960 if (q == e) {
4961 if (consumed)
4962 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004963 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004964 }
4965
Victor Stinnere64322e2012-10-30 23:12:47 +01004966#ifdef WORDS_BIGENDIAN
4967 le = bo < 0;
4968#else
4969 le = bo <= 0;
4970#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004971 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01004972
Victor Stinner8f674cc2013-04-17 23:02:17 +02004973 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004974 writer.min_length = (e - q + 3) / 4;
4975 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004976 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004977
Victor Stinnere64322e2012-10-30 23:12:47 +01004978 while (1) {
4979 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004980 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004981
Victor Stinnere64322e2012-10-30 23:12:47 +01004982 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004983 enum PyUnicode_Kind kind = writer.kind;
4984 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004985 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004986 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004987 if (le) {
4988 do {
4989 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4990 if (ch > maxch)
4991 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004992 if (kind != PyUnicode_1BYTE_KIND &&
4993 Py_UNICODE_IS_SURROGATE(ch))
4994 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004995 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004996 q += 4;
4997 } while (q <= last);
4998 }
4999 else {
5000 do {
5001 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5002 if (ch > maxch)
5003 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005004 if (kind != PyUnicode_1BYTE_KIND &&
5005 Py_UNICODE_IS_SURROGATE(ch))
5006 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005007 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005008 q += 4;
5009 } while (q <= last);
5010 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005011 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005012 }
5013
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005014 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005015 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005016 startinpos = ((const char *)q) - starts;
5017 endinpos = startinpos + 4;
5018 }
5019 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005020 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005021 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005022 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005023 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005024 startinpos = ((const char *)q) - starts;
5025 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005026 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005027 else {
5028 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005029 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005030 goto onError;
5031 q += 4;
5032 continue;
5033 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005034 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005035 startinpos = ((const char *)q) - starts;
5036 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005038
5039 /* The remaining input chars are ignored if the callback
5040 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005041 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005043 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005044 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005045 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005047 }
5048
Walter Dörwald41980ca2007-08-16 21:55:45 +00005049 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005050 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051
Walter Dörwald41980ca2007-08-16 21:55:45 +00005052 Py_XDECREF(errorHandler);
5053 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005054 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005055
Benjamin Peterson29060642009-01-31 22:14:21 +00005056 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005057 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005058 Py_XDECREF(errorHandler);
5059 Py_XDECREF(exc);
5060 return NULL;
5061}
5062
5063PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005064_PyUnicode_EncodeUTF32(PyObject *str,
5065 const char *errors,
5066 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005067{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005068 enum PyUnicode_Kind kind;
5069 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005070 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005071 PyObject *v;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005072 PY_UINT32_T *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005073#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005074 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005075#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005076 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005078 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005079 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005080 PyObject *errorHandler = NULL;
5081 PyObject *exc = NULL;
5082 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005083
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005084 if (!PyUnicode_Check(str)) {
5085 PyErr_BadArgument();
5086 return NULL;
5087 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005088 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005089 return NULL;
5090 kind = PyUnicode_KIND(str);
5091 data = PyUnicode_DATA(str);
5092 len = PyUnicode_GET_LENGTH(str);
5093
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005094 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005095 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005096 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005097 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098 if (v == NULL)
5099 return NULL;
5100
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005101 /* output buffer is 4-bytes aligned */
5102 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5103 out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005104 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005105 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005106 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005107 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005108
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005109 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005110 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005111 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005112 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005113 else
5114 encoding = "utf-32";
5115
5116 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005117 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5118 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005119 }
5120
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005121 pos = 0;
5122 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005123 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005124
5125 if (kind == PyUnicode_2BYTE_KIND) {
5126 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5127 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005128 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005129 else {
5130 assert(kind == PyUnicode_4BYTE_KIND);
5131 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5132 &out, native_ordering);
5133 }
5134 if (pos == len)
5135 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005136
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005137 rep = unicode_encode_call_errorhandler(
5138 errors, &errorHandler,
5139 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005140 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005141 if (!rep)
5142 goto error;
5143
5144 if (PyBytes_Check(rep)) {
5145 repsize = PyBytes_GET_SIZE(rep);
5146 if (repsize & 3) {
5147 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005148 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005149 "surrogates not allowed");
5150 goto error;
5151 }
5152 moreunits = repsize / 4;
5153 }
5154 else {
5155 assert(PyUnicode_Check(rep));
5156 if (PyUnicode_READY(rep) < 0)
5157 goto error;
5158 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5159 if (!PyUnicode_IS_ASCII(rep)) {
5160 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005161 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005162 "surrogates not allowed");
5163 goto error;
5164 }
5165 }
5166
5167 /* four bytes are reserved for each surrogate */
5168 if (moreunits > 1) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005169 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005170 Py_ssize_t morebytes = 4 * (moreunits - 1);
5171 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5172 /* integer overflow */
5173 PyErr_NoMemory();
5174 goto error;
5175 }
5176 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5177 goto error;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005178 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005179 }
5180
5181 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005182 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5183 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005184 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005185 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005186 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5187 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005188 }
5189
5190 Py_CLEAR(rep);
5191 }
5192
5193 /* Cut back to size actually needed. This is necessary for, for example,
5194 encoding of a string containing isolated surrogates and the 'ignore'
5195 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005196 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005197 if (nsize != PyBytes_GET_SIZE(v))
5198 _PyBytes_Resize(&v, nsize);
5199 Py_XDECREF(errorHandler);
5200 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005201 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005202 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005203 error:
5204 Py_XDECREF(rep);
5205 Py_XDECREF(errorHandler);
5206 Py_XDECREF(exc);
5207 Py_XDECREF(v);
5208 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005209}
5210
Alexander Belopolsky40018472011-02-26 01:02:56 +00005211PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005212PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5213 Py_ssize_t size,
5214 const char *errors,
5215 int byteorder)
5216{
5217 PyObject *result;
5218 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5219 if (tmp == NULL)
5220 return NULL;
5221 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5222 Py_DECREF(tmp);
5223 return result;
5224}
5225
5226PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005227PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005228{
Victor Stinnerb960b342011-11-20 19:12:52 +01005229 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005230}
5231
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232/* --- UTF-16 Codec ------------------------------------------------------- */
5233
Tim Peters772747b2001-08-09 22:21:55 +00005234PyObject *
5235PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 Py_ssize_t size,
5237 const char *errors,
5238 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239{
Walter Dörwald69652032004-09-07 20:24:22 +00005240 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5241}
5242
5243PyObject *
5244PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005245 Py_ssize_t size,
5246 const char *errors,
5247 int *byteorder,
5248 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005249{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005250 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005251 Py_ssize_t startinpos;
5252 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005253 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005254 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005255 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005256 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005257 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005258 PyObject *errorHandler = NULL;
5259 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005260 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261
Tim Peters772747b2001-08-09 22:21:55 +00005262 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005263 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264
5265 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005266 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005268 /* Check for BOM marks (U+FEFF) in the input and adjust current
5269 byte order setting accordingly. In native mode, the leading BOM
5270 mark is skipped, in all other modes, it is copied to the output
5271 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005272 if (bo == 0 && size >= 2) {
5273 const Py_UCS4 bom = (q[1] << 8) | q[0];
5274 if (bom == 0xFEFF) {
5275 q += 2;
5276 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005277 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005278 else if (bom == 0xFFFE) {
5279 q += 2;
5280 bo = 1;
5281 }
5282 if (byteorder)
5283 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005284 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285
Antoine Pitrou63065d72012-05-15 23:48:04 +02005286 if (q == e) {
5287 if (consumed)
5288 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005289 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005290 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005291
Christian Heimes743e0cd2012-10-17 23:52:17 +02005292#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005293 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005294 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005295#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005296 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005297 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005298#endif
Tim Peters772747b2001-08-09 22:21:55 +00005299
Antoine Pitrou63065d72012-05-15 23:48:04 +02005300 /* Note: size will always be longer than the resulting Unicode
5301 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005302 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005303 writer.min_length = (e - q + 1) / 2;
5304 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005305 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005306
Antoine Pitrou63065d72012-05-15 23:48:04 +02005307 while (1) {
5308 Py_UCS4 ch = 0;
5309 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005310 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005311 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005312 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005313 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005314 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005315 native_ordering);
5316 else
5317 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005318 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005319 native_ordering);
5320 } else if (kind == PyUnicode_2BYTE_KIND) {
5321 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005322 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005323 native_ordering);
5324 } else {
5325 assert(kind == PyUnicode_4BYTE_KIND);
5326 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005327 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005328 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005329 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005330 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005331
Antoine Pitrou63065d72012-05-15 23:48:04 +02005332 switch (ch)
5333 {
5334 case 0:
5335 /* remaining byte at the end? (size should be even) */
5336 if (q == e || consumed)
5337 goto End;
5338 errmsg = "truncated data";
5339 startinpos = ((const char *)q) - starts;
5340 endinpos = ((const char *)e) - starts;
5341 break;
5342 /* The remaining input chars are ignored if the callback
5343 chooses to skip the input */
5344 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005345 q -= 2;
5346 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005347 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005348 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005349 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005350 endinpos = ((const char *)e) - starts;
5351 break;
5352 case 2:
5353 errmsg = "illegal encoding";
5354 startinpos = ((const char *)q) - 2 - starts;
5355 endinpos = startinpos + 2;
5356 break;
5357 case 3:
5358 errmsg = "illegal UTF-16 surrogate";
5359 startinpos = ((const char *)q) - 4 - starts;
5360 endinpos = startinpos + 2;
5361 break;
5362 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005363 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005364 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 continue;
5366 }
5367
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005368 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005369 errors,
5370 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005371 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005372 &starts,
5373 (const char **)&e,
5374 &startinpos,
5375 &endinpos,
5376 &exc,
5377 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005378 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380 }
5381
Antoine Pitrou63065d72012-05-15 23:48:04 +02005382End:
Walter Dörwald69652032004-09-07 20:24:22 +00005383 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005384 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005385
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005386 Py_XDECREF(errorHandler);
5387 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005388 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389
Benjamin Peterson29060642009-01-31 22:14:21 +00005390 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005391 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005392 Py_XDECREF(errorHandler);
5393 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 return NULL;
5395}
5396
Tim Peters772747b2001-08-09 22:21:55 +00005397PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005398_PyUnicode_EncodeUTF16(PyObject *str,
5399 const char *errors,
5400 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005402 enum PyUnicode_Kind kind;
5403 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005404 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005405 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005406 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005407 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005408#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005409 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005410#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005411 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005412#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005413 const char *encoding;
5414 Py_ssize_t nsize, pos;
5415 PyObject *errorHandler = NULL;
5416 PyObject *exc = NULL;
5417 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005418
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005419 if (!PyUnicode_Check(str)) {
5420 PyErr_BadArgument();
5421 return NULL;
5422 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005423 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005424 return NULL;
5425 kind = PyUnicode_KIND(str);
5426 data = PyUnicode_DATA(str);
5427 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005428
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005429 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005430 if (kind == PyUnicode_4BYTE_KIND) {
5431 const Py_UCS4 *in = (const Py_UCS4 *)data;
5432 const Py_UCS4 *end = in + len;
5433 while (in < end)
5434 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005435 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005436 }
5437 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005438 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005439 nsize = len + pairs + (byteorder == 0);
5440 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 if (v == NULL)
5442 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005444 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005445 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005446 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005448 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005449 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005450 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005451
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005452 if (kind == PyUnicode_1BYTE_KIND) {
5453 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5454 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005455 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005456
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005457 if (byteorder < 0)
5458 encoding = "utf-16-le";
5459 else if (byteorder > 0)
5460 encoding = "utf-16-be";
5461 else
5462 encoding = "utf-16";
5463
5464 pos = 0;
5465 while (pos < len) {
5466 Py_ssize_t repsize, moreunits;
5467
5468 if (kind == PyUnicode_2BYTE_KIND) {
5469 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5470 &out, native_ordering);
5471 }
5472 else {
5473 assert(kind == PyUnicode_4BYTE_KIND);
5474 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5475 &out, native_ordering);
5476 }
5477 if (pos == len)
5478 break;
5479
5480 rep = unicode_encode_call_errorhandler(
5481 errors, &errorHandler,
5482 encoding, "surrogates not allowed",
5483 str, &exc, pos, pos + 1, &pos);
5484 if (!rep)
5485 goto error;
5486
5487 if (PyBytes_Check(rep)) {
5488 repsize = PyBytes_GET_SIZE(rep);
5489 if (repsize & 1) {
5490 raise_encode_exception(&exc, encoding,
5491 str, pos - 1, pos,
5492 "surrogates not allowed");
5493 goto error;
5494 }
5495 moreunits = repsize / 2;
5496 }
5497 else {
5498 assert(PyUnicode_Check(rep));
5499 if (PyUnicode_READY(rep) < 0)
5500 goto error;
5501 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5502 if (!PyUnicode_IS_ASCII(rep)) {
5503 raise_encode_exception(&exc, encoding,
5504 str, pos - 1, pos,
5505 "surrogates not allowed");
5506 goto error;
5507 }
5508 }
5509
5510 /* two bytes are reserved for each surrogate */
5511 if (moreunits > 1) {
5512 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5513 Py_ssize_t morebytes = 2 * (moreunits - 1);
5514 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5515 /* integer overflow */
5516 PyErr_NoMemory();
5517 goto error;
5518 }
5519 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5520 goto error;
5521 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5522 }
5523
5524 if (PyBytes_Check(rep)) {
5525 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5526 out += moreunits;
5527 } else /* rep is unicode */ {
5528 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5529 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5530 &out, native_ordering);
5531 }
5532
5533 Py_CLEAR(rep);
5534 }
5535
5536 /* Cut back to size actually needed. This is necessary for, for example,
5537 encoding of a string containing isolated surrogates and the 'ignore' handler
5538 is used. */
5539 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5540 if (nsize != PyBytes_GET_SIZE(v))
5541 _PyBytes_Resize(&v, nsize);
5542 Py_XDECREF(errorHandler);
5543 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005544 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005545 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005546 error:
5547 Py_XDECREF(rep);
5548 Py_XDECREF(errorHandler);
5549 Py_XDECREF(exc);
5550 Py_XDECREF(v);
5551 return NULL;
5552#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553}
5554
Alexander Belopolsky40018472011-02-26 01:02:56 +00005555PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005556PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5557 Py_ssize_t size,
5558 const char *errors,
5559 int byteorder)
5560{
5561 PyObject *result;
5562 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5563 if (tmp == NULL)
5564 return NULL;
5565 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5566 Py_DECREF(tmp);
5567 return result;
5568}
5569
5570PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005571PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005573 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574}
5575
5576/* --- Unicode Escape Codec ----------------------------------------------- */
5577
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005578/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5579 if all the escapes in the string make it still a valid ASCII string.
5580 Returns -1 if any escapes were found which cause the string to
5581 pop out of ASCII range. Otherwise returns the length of the
5582 required buffer to hold the string.
5583 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005584static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005585length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5586{
5587 const unsigned char *p = (const unsigned char *)s;
5588 const unsigned char *end = p + size;
5589 Py_ssize_t length = 0;
5590
5591 if (size < 0)
5592 return -1;
5593
5594 for (; p < end; ++p) {
5595 if (*p > 127) {
5596 /* Non-ASCII */
5597 return -1;
5598 }
5599 else if (*p != '\\') {
5600 /* Normal character */
5601 ++length;
5602 }
5603 else {
5604 /* Backslash-escape, check next char */
5605 ++p;
5606 /* Escape sequence reaches till end of string or
5607 non-ASCII follow-up. */
5608 if (p >= end || *p > 127)
5609 return -1;
5610 switch (*p) {
5611 case '\n':
5612 /* backslash + \n result in zero characters */
5613 break;
5614 case '\\': case '\'': case '\"':
5615 case 'b': case 'f': case 't':
5616 case 'n': case 'r': case 'v': case 'a':
5617 ++length;
5618 break;
5619 case '0': case '1': case '2': case '3':
5620 case '4': case '5': case '6': case '7':
5621 case 'x': case 'u': case 'U': case 'N':
5622 /* these do not guarantee ASCII characters */
5623 return -1;
5624 default:
5625 /* count the backslash + the other character */
5626 length += 2;
5627 }
5628 }
5629 }
5630 return length;
5631}
5632
Fredrik Lundh06d12682001-01-24 07:59:11 +00005633static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005634
Alexander Belopolsky40018472011-02-26 01:02:56 +00005635PyObject *
5636PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005637 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005638 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005640 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005641 Py_ssize_t startinpos;
5642 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005643 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005645 char* message;
5646 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005647 PyObject *errorHandler = NULL;
5648 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005649 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005650
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005651 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005652 if (len == 0)
5653 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005654
5655 /* After length_of_escaped_ascii_string() there are two alternatives,
5656 either the string is pure ASCII with named escapes like \n, etc.
5657 and we determined it's exact size (common case)
5658 or it contains \x, \u, ... escape sequences. then we create a
5659 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005660 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005661 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005662 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005663 }
5664 else {
5665 /* Escaped strings will always be longer than the resulting
5666 Unicode string, so we start with size here and then reduce the
5667 length after conversion to the true value.
5668 (but if the error callback returns a long replacement string
5669 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005670 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005671 }
5672
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005674 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005676
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 while (s < end) {
5678 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005679 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005680 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681
5682 /* Non-escape characters are interpreted as Unicode ordinals */
5683 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005684 x = (unsigned char)*s;
5685 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005686 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005687 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 continue;
5689 }
5690
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005691 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 /* \ - Escapes */
5693 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005694 c = *s++;
5695 if (s > end)
5696 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005697
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005698 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005701#define WRITECHAR(ch) \
5702 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005703 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005704 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005705 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005706
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005708 case '\\': WRITECHAR('\\'); break;
5709 case '\'': WRITECHAR('\''); break;
5710 case '\"': WRITECHAR('\"'); break;
5711 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005712 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005713 case 'f': WRITECHAR('\014'); break;
5714 case 't': WRITECHAR('\t'); break;
5715 case 'n': WRITECHAR('\n'); break;
5716 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005717 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005718 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005719 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005720 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721
Benjamin Peterson29060642009-01-31 22:14:21 +00005722 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723 case '0': case '1': case '2': case '3':
5724 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005725 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005726 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005727 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005728 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005729 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005731 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 break;
5733
Benjamin Peterson29060642009-01-31 22:14:21 +00005734 /* hex escapes */
5735 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005737 digits = 2;
5738 message = "truncated \\xXX escape";
5739 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005743 digits = 4;
5744 message = "truncated \\uXXXX escape";
5745 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005748 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005749 digits = 8;
5750 message = "truncated \\UXXXXXXXX escape";
5751 hexescape:
5752 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005753 if (end - s < digits) {
5754 /* count only hex digits */
5755 for (; s < end; ++s) {
5756 c = (unsigned char)*s;
5757 if (!Py_ISXDIGIT(c))
5758 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005759 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005760 goto error;
5761 }
5762 for (; digits--; ++s) {
5763 c = (unsigned char)*s;
5764 if (!Py_ISXDIGIT(c))
5765 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005766 chr = (chr<<4) & ~0xF;
5767 if (c >= '0' && c <= '9')
5768 chr += c - '0';
5769 else if (c >= 'a' && c <= 'f')
5770 chr += 10 + c - 'a';
5771 else
5772 chr += 10 + c - 'A';
5773 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005774 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005775 /* _decoding_error will have already written into the
5776 target buffer. */
5777 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005778 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005779 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005780 message = "illegal Unicode character";
5781 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005782 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005783 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005784 break;
5785
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005787 case 'N':
5788 message = "malformed \\N character escape";
5789 if (ucnhash_CAPI == NULL) {
5790 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005791 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5792 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005793 if (ucnhash_CAPI == NULL)
5794 goto ucnhashError;
5795 }
5796 if (*s == '{') {
5797 const char *start = s+1;
5798 /* look for the closing brace */
5799 while (*s != '}' && s < end)
5800 s++;
5801 if (s > start && s < end && *s == '}') {
5802 /* found a name. look it up in the unicode database */
5803 message = "unknown Unicode character name";
5804 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005805 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005806 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005807 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005808 goto store;
5809 }
5810 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005811 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005812
5813 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005814 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005815 message = "\\ at end of string";
5816 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005817 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005818 }
5819 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005820 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005821 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005822 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005823 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005825 continue;
5826
5827 error:
5828 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005829 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005830 errors, &errorHandler,
5831 "unicodeescape", message,
5832 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005833 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005834 goto onError;
5835 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005837#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005838
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005839 Py_XDECREF(errorHandler);
5840 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005841 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005842
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005844 PyErr_SetString(
5845 PyExc_UnicodeError,
5846 "\\N escapes not supported (can't load unicodedata module)"
5847 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005848 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005849 Py_XDECREF(errorHandler);
5850 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005851 return NULL;
5852
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005854 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005855 Py_XDECREF(errorHandler);
5856 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857 return NULL;
5858}
5859
5860/* Return a Unicode-Escape string version of the Unicode object.
5861
5862 If quotes is true, the string is enclosed in u"" or u'' quotes as
5863 appropriate.
5864
5865*/
5866
Alexander Belopolsky40018472011-02-26 01:02:56 +00005867PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005868PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005870 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005871 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005873 int kind;
5874 void *data;
5875 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876
Ezio Melottie7f90372012-10-05 03:33:31 +03005877 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005878 escape.
5879
Ezio Melottie7f90372012-10-05 03:33:31 +03005880 For UCS1 strings it's '\xxx', 4 bytes per source character.
5881 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5882 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005883 */
5884
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005885 if (!PyUnicode_Check(unicode)) {
5886 PyErr_BadArgument();
5887 return NULL;
5888 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005889 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005890 return NULL;
5891 len = PyUnicode_GET_LENGTH(unicode);
5892 kind = PyUnicode_KIND(unicode);
5893 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005894 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005895 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5896 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5897 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5898 }
5899
5900 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005901 return PyBytes_FromStringAndSize(NULL, 0);
5902
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005903 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005904 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005905
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005906 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005907 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005908 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005909 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 if (repr == NULL)
5911 return NULL;
5912
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005913 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005915 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005916 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005917
Walter Dörwald79e913e2007-05-12 11:08:06 +00005918 /* Escape backslashes */
5919 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 *p++ = '\\';
5921 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005922 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005923 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005924
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005925 /* Map 21-bit characters to '\U00xxxxxx' */
5926 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005927 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005928 *p++ = '\\';
5929 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005930 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5931 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5932 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5933 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5934 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5935 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5936 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5937 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005938 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005939 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005940
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005942 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 *p++ = '\\';
5944 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005945 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5946 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5947 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5948 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005950
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005951 /* Map special whitespace to '\t', \n', '\r' */
5952 else if (ch == '\t') {
5953 *p++ = '\\';
5954 *p++ = 't';
5955 }
5956 else if (ch == '\n') {
5957 *p++ = '\\';
5958 *p++ = 'n';
5959 }
5960 else if (ch == '\r') {
5961 *p++ = '\\';
5962 *p++ = 'r';
5963 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005964
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005965 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005966 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005968 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005969 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5970 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005971 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005972
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 /* Copy everything else as-is */
5974 else
5975 *p++ = (char) ch;
5976 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005978 assert(p - PyBytes_AS_STRING(repr) > 0);
5979 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5980 return NULL;
5981 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982}
5983
Alexander Belopolsky40018472011-02-26 01:02:56 +00005984PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005985PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5986 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005988 PyObject *result;
5989 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5990 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005992 result = PyUnicode_AsUnicodeEscapeString(tmp);
5993 Py_DECREF(tmp);
5994 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995}
5996
5997/* --- Raw Unicode Escape Codec ------------------------------------------- */
5998
Alexander Belopolsky40018472011-02-26 01:02:56 +00005999PyObject *
6000PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006001 Py_ssize_t size,
6002 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006004 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006005 Py_ssize_t startinpos;
6006 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006007 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 const char *end;
6009 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006010 PyObject *errorHandler = NULL;
6011 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006012
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006013 if (size == 0)
6014 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006015
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016 /* Escaped strings will always be longer than the resulting
6017 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006018 length after conversion to the true value. (But decoding error
6019 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006020 _PyUnicodeWriter_Init(&writer);
6021 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006022
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 end = s + size;
6024 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 unsigned char c;
6026 Py_UCS4 x;
6027 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006028 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029
Benjamin Peterson29060642009-01-31 22:14:21 +00006030 /* Non-escape characters are interpreted as Unicode ordinals */
6031 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006032 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006033 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006034 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006035 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006036 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 startinpos = s-starts;
6038
6039 /* \u-escapes are only interpreted iff the number of leading
6040 backslashes if odd */
6041 bs = s;
6042 for (;s < end;) {
6043 if (*s != '\\')
6044 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006045 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006046 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006047 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 }
6049 if (((s - bs) & 1) == 0 ||
6050 s >= end ||
6051 (*s != 'u' && *s != 'U')) {
6052 continue;
6053 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006054 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006055 count = *s=='u' ? 4 : 8;
6056 s++;
6057
6058 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006059 for (x = 0, i = 0; i < count; ++i, ++s) {
6060 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006061 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006062 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006063 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006064 errors, &errorHandler,
6065 "rawunicodeescape", "truncated \\uXXXX",
6066 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006067 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 goto onError;
6069 goto nextByte;
6070 }
6071 x = (x<<4) & ~0xF;
6072 if (c >= '0' && c <= '9')
6073 x += c - '0';
6074 else if (c >= 'a' && c <= 'f')
6075 x += 10 + c - 'a';
6076 else
6077 x += 10 + c - 'A';
6078 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006079 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006080 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006081 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006082 }
6083 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006084 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006085 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006086 errors, &errorHandler,
6087 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006089 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006090 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006091 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 nextByte:
6093 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006095 Py_XDECREF(errorHandler);
6096 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006097 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006098
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006100 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006101 Py_XDECREF(errorHandler);
6102 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 return NULL;
6104}
6105
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006106
Alexander Belopolsky40018472011-02-26 01:02:56 +00006107PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006108PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006110 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 char *p;
6112 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006113 Py_ssize_t expandsize, pos;
6114 int kind;
6115 void *data;
6116 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006118 if (!PyUnicode_Check(unicode)) {
6119 PyErr_BadArgument();
6120 return NULL;
6121 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006122 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006123 return NULL;
6124 kind = PyUnicode_KIND(unicode);
6125 data = PyUnicode_DATA(unicode);
6126 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006127 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6128 bytes, and 1 byte characters 4. */
6129 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006130
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006131 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006133
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006134 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135 if (repr == NULL)
6136 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006137 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006138 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006140 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006141 for (pos = 0; pos < len; pos++) {
6142 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006143 /* Map 32-bit characters to '\Uxxxxxxxx' */
6144 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006145 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006146 *p++ = '\\';
6147 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006148 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6149 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6150 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6151 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6152 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6153 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6154 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6155 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006156 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006158 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 *p++ = '\\';
6160 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006161 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6162 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6163 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6164 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006166 /* Copy everything else as-is */
6167 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 *p++ = (char) ch;
6169 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006170
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006171 assert(p > q);
6172 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006173 return NULL;
6174 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175}
6176
Alexander Belopolsky40018472011-02-26 01:02:56 +00006177PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006178PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6179 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006181 PyObject *result;
6182 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6183 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006184 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6186 Py_DECREF(tmp);
6187 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188}
6189
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006190/* --- Unicode Internal Codec ------------------------------------------- */
6191
/* Decode `size` bytes at `s`, read as consecutive raw Py_UNICODE units,
   into a str object built with a _PyUnicodeWriter.

   A DeprecationWarning is issued first; if the warning call reports an
   error the function returns NULL straight away.  Empty input yields the
   empty string.  Trailing bytes that do not fill a whole unit, and (in
   Py_UNICODE_WIDE builds) units above 0x10ffff, are passed to the decode
   error handler selected by `errors`.

   Returns the result of _PyUnicodeWriter_Finish(), or NULL on error. */
PyObject *
_PyUnicode_DecodeUnicodeInternal(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const char *end;
    const char *reason;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (PyErr_WarnEx(PyExc_DeprecationWarning,
                     "unicode_internal codec has been deprecated",
                     1))
        return NULL;

    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    _PyUnicodeWriter_Init(&writer);
    /* NOTE(review): with a Py_ssize_t `size` and a unit size of at least
       two bytes this condition looks unsatisfiable -- confirm before
       relying on it as an overflow guard. */
    if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
        PyErr_NoMemory();
        goto onError;
    }
    /* Number of units in the input, rounded up. */
    writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;

    end = s + size;
    while (s < end) {
        Py_UNICODE uch;
        Py_UCS4 ch;
        /* Fewer bytes left than one unit: report everything up to the
           end of the input as the faulty range. */
        if (end - s < Py_UNICODE_SIZE) {
            endinpos = end-starts;
            reason = "truncated input";
            goto error;
        }
        /* We copy the raw representation one byte at a time because the
           pointer may be unaligned (see test_codeccallbacks). */
        ((char *) &uch)[0] = s[0];
        ((char *) &uch)[1] = s[1];
#ifdef Py_UNICODE_WIDE
        ((char *) &uch)[2] = s[2];
        ((char *) &uch)[3] = s[3];
#endif
        ch = uch;
#ifdef Py_UNICODE_WIDE
        /* We have to sanity check the raw data, otherwise doom looms for
           some malformed UCS-4 data. */
        if (ch > 0x10ffff) {
            /* s has not been advanced yet, so the faulty range is
               exactly this one unit. */
            endinpos = s - starts + Py_UNICODE_SIZE;
            reason = "illegal code point (> 0x10FFFF)";
            goto error;
        }
#endif
        s += Py_UNICODE_SIZE;
#ifndef Py_UNICODE_WIDE
        /* Narrow units: if a high surrogate is directly followed by a low
           surrogate, consume both and write the joined code point. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
        {
            Py_UNICODE uch2;
            ((char *) &uch2)[0] = s[0];
            ((char *) &uch2)[1] = s[1];
            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
            {
                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
                s += Py_UNICODE_SIZE;
            }
        }
#endif

        if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
            goto onError;
        continue;

    error:
        /* Reached only through the gotos above, with endinpos and reason
           already set; the faulty range starts at the current s. */
        startinpos = s - starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicode_internal", reason,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6287
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288/* --- Latin-1 Codec ------------------------------------------------------ */
6289
Alexander Belopolsky40018472011-02-26 01:02:56 +00006290PyObject *
6291PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006292 Py_ssize_t size,
6293 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006296 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297}
6298
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006299/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006300static void
6301make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006302 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006303 PyObject *unicode,
6304 Py_ssize_t startpos, Py_ssize_t endpos,
6305 const char *reason)
6306{
6307 if (*exceptionObject == NULL) {
6308 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006309 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006310 encoding, unicode, startpos, endpos, reason);
6311 }
6312 else {
6313 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6314 goto onError;
6315 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6316 goto onError;
6317 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6318 goto onError;
6319 return;
6320 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006321 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006322 }
6323}
6324
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006325/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006326static void
6327raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006328 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006329 PyObject *unicode,
6330 Py_ssize_t startpos, Py_ssize_t endpos,
6331 const char *reason)
6332{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006333 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006334 encoding, unicode, startpos, endpos, reason);
6335 if (*exceptionObject != NULL)
6336 PyCodec_StrictErrors(*exceptionObject);
6337}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006338
6339/* error handling callback helper:
6340 build arguments, call the callback and check the arguments,
6341 put the result into newpos and return the replacement string, which
6342 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006343static PyObject *
6344unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006345 PyObject **errorHandler,
6346 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006347 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006348 Py_ssize_t startpos, Py_ssize_t endpos,
6349 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006350{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006351 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006352 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006353 PyObject *restuple;
6354 PyObject *resunicode;
6355
6356 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006358 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006360 }
6361
Benjamin Petersonbac79492012-01-14 13:34:47 -05006362 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006363 return NULL;
6364 len = PyUnicode_GET_LENGTH(unicode);
6365
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006366 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006367 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006368 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006369 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006370
6371 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006372 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006373 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006375 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006376 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 Py_DECREF(restuple);
6378 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006379 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006380 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 &resunicode, newpos)) {
6382 Py_DECREF(restuple);
6383 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006384 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006385 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6386 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6387 Py_DECREF(restuple);
6388 return NULL;
6389 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006390 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006391 *newpos = len + *newpos;
6392 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006393 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 Py_DECREF(restuple);
6395 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006396 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006397 Py_INCREF(resunicode);
6398 Py_DECREF(restuple);
6399 return resunicode;
6400}
6401
Alexander Belopolsky40018472011-02-26 01:02:56 +00006402static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006403unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006404 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006405 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006407 /* input state */
6408 Py_ssize_t pos=0, size;
6409 int kind;
6410 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006411 /* output object */
6412 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006413 /* pointer into the output */
6414 char *str;
6415 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006416 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006417 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6418 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 PyObject *errorHandler = NULL;
6420 PyObject *exc = NULL;
6421 /* the following variable is used for caching string comparisons
6422 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6423 int known_errorHandler = -1;
6424
Benjamin Petersonbac79492012-01-14 13:34:47 -05006425 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006426 return NULL;
6427 size = PyUnicode_GET_LENGTH(unicode);
6428 kind = PyUnicode_KIND(unicode);
6429 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 /* allocate enough for a simple encoding without
6431 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006432 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006433 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006434 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006435 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006436 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006437 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006438 ressize = size;
6439
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006440 while (pos < size) {
6441 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006442
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 /* can we encode this? */
6444 if (c<limit) {
6445 /* no overflow check, because we know that the space is enough */
6446 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006447 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006448 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006450 Py_ssize_t requiredsize;
6451 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006452 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006454 Py_ssize_t collstart = pos;
6455 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 /* find all unecodable characters */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006457 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 ++collend;
6459 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6460 if (known_errorHandler==-1) {
6461 if ((errors==NULL) || (!strcmp(errors, "strict")))
6462 known_errorHandler = 1;
6463 else if (!strcmp(errors, "replace"))
6464 known_errorHandler = 2;
6465 else if (!strcmp(errors, "ignore"))
6466 known_errorHandler = 3;
6467 else if (!strcmp(errors, "xmlcharrefreplace"))
6468 known_errorHandler = 4;
6469 else
6470 known_errorHandler = 0;
6471 }
6472 switch (known_errorHandler) {
6473 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006474 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 goto onError;
6476 case 2: /* replace */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006477 while (collstart++ < collend)
Benjamin Peterson29060642009-01-31 22:14:21 +00006478 *str++ = '?'; /* fall through */
6479 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006480 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 break;
6482 case 4: /* xmlcharrefreplace */
6483 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006484 requiredsize = respos;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006485 /* determine replacement size */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006486 for (i = collstart; i < collend; ++i) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006487 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006488 Py_ssize_t incr;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006489 if (ch < 10)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006490 incr = 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006491 else if (ch < 100)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006492 incr = 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 else if (ch < 1000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006494 incr = 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006495 else if (ch < 10000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006496 incr = 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006497 else if (ch < 100000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006498 incr = 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006499 else if (ch < 1000000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006500 incr = 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006501 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006502 assert(ch <= MAX_UNICODE);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006503 incr = 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006504 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006505 if (requiredsize > PY_SSIZE_T_MAX - incr)
6506 goto overflow;
6507 requiredsize += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006509 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6510 goto overflow;
6511 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006513 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 requiredsize = 2*ressize;
6515 if (_PyBytes_Resize(&res, requiredsize))
6516 goto onError;
6517 str = PyBytes_AS_STRING(res) + respos;
6518 ressize = requiredsize;
6519 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006520 /* generate replacement */
6521 for (i = collstart; i < collend; ++i) {
6522 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006524 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 break;
6526 default:
6527 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006528 encoding, reason, unicode, &exc,
6529 collstart, collend, &newpos);
6530 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006531 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006532 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006533 if (PyBytes_Check(repunicode)) {
6534 /* Directly copy bytes result to output. */
6535 repsize = PyBytes_Size(repunicode);
6536 if (repsize > 1) {
6537 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006538 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006539 if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6540 Py_DECREF(repunicode);
6541 goto overflow;
6542 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00006543 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6544 Py_DECREF(repunicode);
6545 goto onError;
6546 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006547 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006548 ressize += repsize-1;
6549 }
6550 memcpy(str, PyBytes_AsString(repunicode), repsize);
6551 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006552 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006553 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006554 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006555 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 /* need more space? (at least enough for what we
6557 have+the replacement+the rest of the string, so
6558 we won't have to check space for encodable characters) */
6559 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006560 repsize = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006561 requiredsize = respos;
6562 if (requiredsize > PY_SSIZE_T_MAX - repsize)
6563 goto overflow;
6564 requiredsize += repsize;
6565 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6566 goto overflow;
6567 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006568 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006569 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 requiredsize = 2*ressize;
6571 if (_PyBytes_Resize(&res, requiredsize)) {
6572 Py_DECREF(repunicode);
6573 goto onError;
6574 }
6575 str = PyBytes_AS_STRING(res) + respos;
6576 ressize = requiredsize;
6577 }
6578 /* check if there is anything unencodable in the replacement
6579 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006580 for (i = 0; repsize-->0; ++i, ++str) {
6581 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006582 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006583 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006584 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006585 Py_DECREF(repunicode);
6586 goto onError;
6587 }
6588 *str = (char)c;
6589 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006590 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006591 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006592 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006593 }
6594 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006595 /* Resize if we allocated to much */
6596 size = str - PyBytes_AS_STRING(res);
6597 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006598 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006599 if (_PyBytes_Resize(&res, size) < 0)
6600 goto onError;
6601 }
6602
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006603 Py_XDECREF(errorHandler);
6604 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006605 return res;
6606
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006607 overflow:
6608 PyErr_SetString(PyExc_OverflowError,
6609 "encoded result is too long for a Python string");
6610
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006611 onError:
6612 Py_XDECREF(res);
6613 Py_XDECREF(errorHandler);
6614 Py_XDECREF(exc);
6615 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006616}
6617
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006618/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006619PyObject *
6620PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006621 Py_ssize_t size,
6622 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006624 PyObject *result;
6625 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6626 if (unicode == NULL)
6627 return NULL;
6628 result = unicode_encode_ucs1(unicode, errors, 256);
6629 Py_DECREF(unicode);
6630 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631}
6632
Alexander Belopolsky40018472011-02-26 01:02:56 +00006633PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006634_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635{
6636 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006637 PyErr_BadArgument();
6638 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006640 if (PyUnicode_READY(unicode) == -1)
6641 return NULL;
6642 /* Fast path: if it is a one-byte string, construct
6643 bytes object directly. */
6644 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6645 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6646 PyUnicode_GET_LENGTH(unicode));
6647 /* Non-Latin-1 characters present. Defer to above function to
6648 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006650}
6651
6652PyObject*
6653PyUnicode_AsLatin1String(PyObject *unicode)
6654{
6655 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656}
6657
6658/* --- 7-bit ASCII Codec -------------------------------------------------- */
6659
Alexander Belopolsky40018472011-02-26 01:02:56 +00006660PyObject *
6661PyUnicode_DecodeASCII(const char *s,
6662 Py_ssize_t size,
6663 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006666 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006667 int kind;
6668 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006669 Py_ssize_t startinpos;
6670 Py_ssize_t endinpos;
6671 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006672 const char *e;
6673 PyObject *errorHandler = NULL;
6674 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006675
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006677 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006678
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006680 if (size == 1 && (unsigned char)s[0] < 128)
6681 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006682
Victor Stinner8f674cc2013-04-17 23:02:17 +02006683 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006684 writer.min_length = size;
6685 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006686 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006687
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006688 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006689 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006690 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006691 writer.pos = outpos;
6692 if (writer.pos == size)
6693 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006694
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006695 s += writer.pos;
6696 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006697 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006698 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006699 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006700 PyUnicode_WRITE(kind, data, writer.pos, c);
6701 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006702 ++s;
6703 }
6704 else {
6705 startinpos = s-starts;
6706 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006707 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006708 errors, &errorHandler,
6709 "ascii", "ordinal not in range(128)",
6710 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006711 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006712 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006713 kind = writer.kind;
6714 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006715 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006716 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717 Py_XDECREF(errorHandler);
6718 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006719 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006720
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006722 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006723 Py_XDECREF(errorHandler);
6724 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725 return NULL;
6726}
6727
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006728/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006729PyObject *
6730PyUnicode_EncodeASCII(const Py_UNICODE *p,
6731 Py_ssize_t size,
6732 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006734 PyObject *result;
6735 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6736 if (unicode == NULL)
6737 return NULL;
6738 result = unicode_encode_ucs1(unicode, errors, 128);
6739 Py_DECREF(unicode);
6740 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741}
6742
Alexander Belopolsky40018472011-02-26 01:02:56 +00006743PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006744_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745{
6746 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006747 PyErr_BadArgument();
6748 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006750 if (PyUnicode_READY(unicode) == -1)
6751 return NULL;
6752 /* Fast path: if it is an ASCII-only string, construct bytes object
6753 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006754 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006755 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6756 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006757 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006758}
6759
6760PyObject *
6761PyUnicode_AsASCIIString(PyObject *unicode)
6762{
6763 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764}
6765
Victor Stinner99b95382011-07-04 14:23:54 +02006766#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006767
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006768/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006769
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006770#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006771#define NEED_RETRY
6772#endif
6773
Victor Stinner3a50e702011-10-18 21:21:00 +02006774#ifndef WC_ERR_INVALID_CHARS
6775# define WC_ERR_INVALID_CHARS 0x0080
6776#endif
6777
6778static char*
6779code_page_name(UINT code_page, PyObject **obj)
6780{
6781 *obj = NULL;
6782 if (code_page == CP_ACP)
6783 return "mbcs";
6784 if (code_page == CP_UTF7)
6785 return "CP_UTF7";
6786 if (code_page == CP_UTF8)
6787 return "CP_UTF8";
6788
6789 *obj = PyBytes_FromFormat("cp%u", code_page);
6790 if (*obj == NULL)
6791 return NULL;
6792 return PyBytes_AS_STRING(*obj);
6793}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006794
Victor Stinner3a50e702011-10-18 21:21:00 +02006795static DWORD
6796decode_code_page_flags(UINT code_page)
6797{
6798 if (code_page == CP_UTF7) {
6799 /* The CP_UTF7 decoder only supports flags=0 */
6800 return 0;
6801 }
6802 else
6803 return MB_ERR_INVALID_CHARS;
6804}
6805
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006806/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006807 * Decode a byte string from a Windows code page into unicode object in strict
6808 * mode.
6809 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006810 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6811 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006812 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006813static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006814decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006815 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006816 const char *in,
6817 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006818{
Victor Stinner3a50e702011-10-18 21:21:00 +02006819 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006820 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006821 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006822
6823 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006824 assert(insize > 0);
6825 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6826 if (outsize <= 0)
6827 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006828
6829 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006831 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006832 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006833 if (*v == NULL)
6834 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006835 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006836 }
6837 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006838 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006839 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006840 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006842 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006843 }
6844
6845 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006846 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6847 if (outsize <= 0)
6848 goto error;
6849 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006850
Victor Stinner3a50e702011-10-18 21:21:00 +02006851error:
6852 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6853 return -2;
6854 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006855 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006856}
6857
Victor Stinner3a50e702011-10-18 21:21:00 +02006858/*
6859 * Decode a byte string from a code page into unicode object with an error
6860 * handler.
6861 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006862 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006863 * UnicodeDecodeError exception and returns -1 on error.
6864 */
6865static int
6866decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006867 PyObject **v,
6868 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01006869 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02006870{
6871 const char *startin = in;
6872 const char *endin = in + size;
6873 const DWORD flags = decode_code_page_flags(code_page);
6874 /* Ideally, we should get reason from FormatMessage. This is the Windows
6875 2000 English version of the message. */
6876 const char *reason = "No mapping for the Unicode character exists "
6877 "in the target code page.";
6878 /* each step cannot decode more than 1 character, but a character can be
6879 represented as a surrogate pair */
6880 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006881 int insize;
6882 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006883 PyObject *errorHandler = NULL;
6884 PyObject *exc = NULL;
6885 PyObject *encoding_obj = NULL;
6886 char *encoding;
6887 DWORD err;
6888 int ret = -1;
6889
6890 assert(size > 0);
6891
6892 encoding = code_page_name(code_page, &encoding_obj);
6893 if (encoding == NULL)
6894 return -1;
6895
Victor Stinner7d00cc12014-03-17 23:08:06 +01006896 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02006897 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6898 UnicodeDecodeError. */
6899 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6900 if (exc != NULL) {
6901 PyCodec_StrictErrors(exc);
6902 Py_CLEAR(exc);
6903 }
6904 goto error;
6905 }
6906
6907 if (*v == NULL) {
6908 /* Create unicode object */
6909 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6910 PyErr_NoMemory();
6911 goto error;
6912 }
Victor Stinnerab595942011-12-17 04:59:06 +01006913 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006914 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006915 if (*v == NULL)
6916 goto error;
6917 startout = PyUnicode_AS_UNICODE(*v);
6918 }
6919 else {
6920 /* Extend unicode object */
6921 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6922 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6923 PyErr_NoMemory();
6924 goto error;
6925 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006926 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006927 goto error;
6928 startout = PyUnicode_AS_UNICODE(*v) + n;
6929 }
6930
6931 /* Decode the byte string character per character */
6932 out = startout;
6933 while (in < endin)
6934 {
6935 /* Decode a character */
6936 insize = 1;
6937 do
6938 {
6939 outsize = MultiByteToWideChar(code_page, flags,
6940 in, insize,
6941 buffer, Py_ARRAY_LENGTH(buffer));
6942 if (outsize > 0)
6943 break;
6944 err = GetLastError();
6945 if (err != ERROR_NO_UNICODE_TRANSLATION
6946 && err != ERROR_INSUFFICIENT_BUFFER)
6947 {
6948 PyErr_SetFromWindowsErr(0);
6949 goto error;
6950 }
6951 insize++;
6952 }
6953 /* 4=maximum length of a UTF-8 sequence */
6954 while (insize <= 4 && (in + insize) <= endin);
6955
6956 if (outsize <= 0) {
6957 Py_ssize_t startinpos, endinpos, outpos;
6958
Victor Stinner7d00cc12014-03-17 23:08:06 +01006959 /* last character in partial decode? */
6960 if (in + insize >= endin && !final)
6961 break;
6962
Victor Stinner3a50e702011-10-18 21:21:00 +02006963 startinpos = in - startin;
6964 endinpos = startinpos + 1;
6965 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006966 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006967 errors, &errorHandler,
6968 encoding, reason,
6969 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006970 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006971 {
6972 goto error;
6973 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006974 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006975 }
6976 else {
6977 in += insize;
6978 memcpy(out, buffer, outsize * sizeof(wchar_t));
6979 out += outsize;
6980 }
6981 }
6982
6983 /* write a NUL character at the end */
6984 *out = 0;
6985
6986 /* Extend unicode object */
6987 outsize = out - startout;
6988 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006989 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006990 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02006991 /* (in - startin) <= size and size is an int */
6992 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02006993
6994error:
6995 Py_XDECREF(encoding_obj);
6996 Py_XDECREF(errorHandler);
6997 Py_XDECREF(exc);
6998 return ret;
6999}
7000
Victor Stinner3a50e702011-10-18 21:21:00 +02007001static PyObject *
7002decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007003 const char *s, Py_ssize_t size,
7004 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007005{
Victor Stinner76a31a62011-11-04 00:05:13 +01007006 PyObject *v = NULL;
7007 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007008
Victor Stinner3a50e702011-10-18 21:21:00 +02007009 if (code_page < 0) {
7010 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7011 return NULL;
7012 }
7013
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007014 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007016
Victor Stinner76a31a62011-11-04 00:05:13 +01007017 do
7018 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007019#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007020 if (size > INT_MAX) {
7021 chunk_size = INT_MAX;
7022 final = 0;
7023 done = 0;
7024 }
7025 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007026#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007027 {
7028 chunk_size = (int)size;
7029 final = (consumed == NULL);
7030 done = 1;
7031 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007032
Victor Stinner76a31a62011-11-04 00:05:13 +01007033 if (chunk_size == 0 && done) {
7034 if (v != NULL)
7035 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007036 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007037 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007038
Victor Stinner76a31a62011-11-04 00:05:13 +01007039 converted = decode_code_page_strict(code_page, &v,
7040 s, chunk_size);
7041 if (converted == -2)
7042 converted = decode_code_page_errors(code_page, &v,
7043 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007044 errors, final);
7045 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007046
7047 if (converted < 0) {
7048 Py_XDECREF(v);
7049 return NULL;
7050 }
7051
7052 if (consumed)
7053 *consumed += converted;
7054
7055 s += converted;
7056 size -= converted;
7057 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007058
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007059 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007060}
7061
Alexander Belopolsky40018472011-02-26 01:02:56 +00007062PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007063PyUnicode_DecodeCodePageStateful(int code_page,
7064 const char *s,
7065 Py_ssize_t size,
7066 const char *errors,
7067 Py_ssize_t *consumed)
7068{
7069 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7070}
7071
7072PyObject *
7073PyUnicode_DecodeMBCSStateful(const char *s,
7074 Py_ssize_t size,
7075 const char *errors,
7076 Py_ssize_t *consumed)
7077{
7078 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7079}
7080
7081PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007082PyUnicode_DecodeMBCS(const char *s,
7083 Py_ssize_t size,
7084 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007085{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007086 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7087}
7088
Victor Stinner3a50e702011-10-18 21:21:00 +02007089static DWORD
7090encode_code_page_flags(UINT code_page, const char *errors)
7091{
7092 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007093 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007094 }
7095 else if (code_page == CP_UTF7) {
7096 /* CP_UTF7 only supports flags=0 */
7097 return 0;
7098 }
7099 else {
7100 if (errors != NULL && strcmp(errors, "replace") == 0)
7101 return 0;
7102 else
7103 return WC_NO_BEST_FIT_CHARS;
7104 }
7105}
7106
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007108 * Encode a Unicode string to a Windows code page into a byte string in strict
7109 * mode.
7110 *
7111 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007112 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007113 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007114static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007115encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007116 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007118{
Victor Stinner554f3f02010-06-16 23:33:54 +00007119 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007120 BOOL *pusedDefaultChar = &usedDefaultChar;
7121 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007122 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007123 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007124 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007125 const DWORD flags = encode_code_page_flags(code_page, NULL);
7126 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007127 /* Create a substring so that we can get the UTF-16 representation
7128 of just the slice under consideration. */
7129 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007130
Martin v. Löwis3d325192011-11-04 18:23:06 +01007131 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007132
Victor Stinner3a50e702011-10-18 21:21:00 +02007133 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007134 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007136 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007137
Victor Stinner2fc507f2011-11-04 20:06:39 +01007138 substring = PyUnicode_Substring(unicode, offset, offset+len);
7139 if (substring == NULL)
7140 return -1;
7141 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7142 if (p == NULL) {
7143 Py_DECREF(substring);
7144 return -1;
7145 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007146 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007147
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007148 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007149 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007150 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007151 NULL, 0,
7152 NULL, pusedDefaultChar);
7153 if (outsize <= 0)
7154 goto error;
7155 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007156 if (pusedDefaultChar && *pusedDefaultChar) {
7157 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007158 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007159 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007160
Victor Stinner3a50e702011-10-18 21:21:00 +02007161 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007162 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007163 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007164 if (*outbytes == NULL) {
7165 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007166 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007167 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007169 }
7170 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007171 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007172 const Py_ssize_t n = PyBytes_Size(*outbytes);
7173 if (outsize > PY_SSIZE_T_MAX - n) {
7174 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007175 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007176 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007177 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007178 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7179 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007180 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007181 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007183 }
7184
7185 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007186 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007187 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007188 out, outsize,
7189 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007190 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007191 if (outsize <= 0)
7192 goto error;
7193 if (pusedDefaultChar && *pusedDefaultChar)
7194 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007195 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007196
Victor Stinner3a50e702011-10-18 21:21:00 +02007197error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007198 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007199 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7200 return -2;
7201 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007202 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007203}
7204
Victor Stinner3a50e702011-10-18 21:21:00 +02007205/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007206 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007207 * error handler.
7208 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007209 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007210 * -1 on other error.
7211 */
7212static int
7213encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007214 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007215 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007216{
Victor Stinner3a50e702011-10-18 21:21:00 +02007217 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007218 Py_ssize_t pos = unicode_offset;
7219 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 /* Ideally, we should get reason from FormatMessage. This is the Windows
7221 2000 English version of the message. */
7222 const char *reason = "invalid character";
7223 /* 4=maximum length of a UTF-8 sequence */
7224 char buffer[4];
7225 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7226 Py_ssize_t outsize;
7227 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007228 PyObject *errorHandler = NULL;
7229 PyObject *exc = NULL;
7230 PyObject *encoding_obj = NULL;
7231 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007232 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007233 PyObject *rep;
7234 int ret = -1;
7235
7236 assert(insize > 0);
7237
7238 encoding = code_page_name(code_page, &encoding_obj);
7239 if (encoding == NULL)
7240 return -1;
7241
7242 if (errors == NULL || strcmp(errors, "strict") == 0) {
7243 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7244 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007245 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007246 if (exc != NULL) {
7247 PyCodec_StrictErrors(exc);
7248 Py_DECREF(exc);
7249 }
7250 Py_XDECREF(encoding_obj);
7251 return -1;
7252 }
7253
7254 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7255 pusedDefaultChar = &usedDefaultChar;
7256 else
7257 pusedDefaultChar = NULL;
7258
7259 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7260 PyErr_NoMemory();
7261 goto error;
7262 }
7263 outsize = insize * Py_ARRAY_LENGTH(buffer);
7264
7265 if (*outbytes == NULL) {
7266 /* Create string object */
7267 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7268 if (*outbytes == NULL)
7269 goto error;
7270 out = PyBytes_AS_STRING(*outbytes);
7271 }
7272 else {
7273 /* Extend string object */
7274 Py_ssize_t n = PyBytes_Size(*outbytes);
7275 if (n > PY_SSIZE_T_MAX - outsize) {
7276 PyErr_NoMemory();
7277 goto error;
7278 }
7279 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7280 goto error;
7281 out = PyBytes_AS_STRING(*outbytes) + n;
7282 }
7283
7284 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007285 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007286 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007287 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7288 wchar_t chars[2];
7289 int charsize;
7290 if (ch < 0x10000) {
7291 chars[0] = (wchar_t)ch;
7292 charsize = 1;
7293 }
7294 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007295 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7296 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007297 charsize = 2;
7298 }
7299
Victor Stinner3a50e702011-10-18 21:21:00 +02007300 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007301 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007302 buffer, Py_ARRAY_LENGTH(buffer),
7303 NULL, pusedDefaultChar);
7304 if (outsize > 0) {
7305 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7306 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007307 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007308 memcpy(out, buffer, outsize);
7309 out += outsize;
7310 continue;
7311 }
7312 }
7313 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7314 PyErr_SetFromWindowsErr(0);
7315 goto error;
7316 }
7317
Victor Stinner3a50e702011-10-18 21:21:00 +02007318 rep = unicode_encode_call_errorhandler(
7319 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007320 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007321 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007322 if (rep == NULL)
7323 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007324 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007325
7326 if (PyBytes_Check(rep)) {
7327 outsize = PyBytes_GET_SIZE(rep);
7328 if (outsize != 1) {
7329 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7330 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7331 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7332 Py_DECREF(rep);
7333 goto error;
7334 }
7335 out = PyBytes_AS_STRING(*outbytes) + offset;
7336 }
7337 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7338 out += outsize;
7339 }
7340 else {
7341 Py_ssize_t i;
7342 enum PyUnicode_Kind kind;
7343 void *data;
7344
Benjamin Petersonbac79492012-01-14 13:34:47 -05007345 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007346 Py_DECREF(rep);
7347 goto error;
7348 }
7349
7350 outsize = PyUnicode_GET_LENGTH(rep);
7351 if (outsize != 1) {
7352 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7353 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7354 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7355 Py_DECREF(rep);
7356 goto error;
7357 }
7358 out = PyBytes_AS_STRING(*outbytes) + offset;
7359 }
7360 kind = PyUnicode_KIND(rep);
7361 data = PyUnicode_DATA(rep);
7362 for (i=0; i < outsize; i++) {
7363 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7364 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007365 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007366 encoding, unicode,
7367 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007368 "unable to encode error handler result to ASCII");
7369 Py_DECREF(rep);
7370 goto error;
7371 }
7372 *out = (unsigned char)ch;
7373 out++;
7374 }
7375 }
7376 Py_DECREF(rep);
7377 }
7378 /* write a NUL byte */
7379 *out = 0;
7380 outsize = out - PyBytes_AS_STRING(*outbytes);
7381 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7382 if (_PyBytes_Resize(outbytes, outsize) < 0)
7383 goto error;
7384 ret = 0;
7385
7386error:
7387 Py_XDECREF(encoding_obj);
7388 Py_XDECREF(errorHandler);
7389 Py_XDECREF(exc);
7390 return ret;
7391}
7392
Victor Stinner3a50e702011-10-18 21:21:00 +02007393static PyObject *
7394encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007395 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007396 const char *errors)
7397{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007398 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007399 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007400 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007401 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007402
Victor Stinner29dacf22015-01-26 16:41:32 +01007403 if (!PyUnicode_Check(unicode)) {
7404 PyErr_BadArgument();
7405 return NULL;
7406 }
7407
Benjamin Petersonbac79492012-01-14 13:34:47 -05007408 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007409 return NULL;
7410 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007411
Victor Stinner3a50e702011-10-18 21:21:00 +02007412 if (code_page < 0) {
7413 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7414 return NULL;
7415 }
7416
Martin v. Löwis3d325192011-11-04 18:23:06 +01007417 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007418 return PyBytes_FromStringAndSize(NULL, 0);
7419
Victor Stinner7581cef2011-11-03 22:32:33 +01007420 offset = 0;
7421 do
7422 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007423#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007424 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007425 chunks. */
7426 if (len > INT_MAX/2) {
7427 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007428 done = 0;
7429 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007430 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007431#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007432 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007433 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007434 done = 1;
7435 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007436
Victor Stinner76a31a62011-11-04 00:05:13 +01007437 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007438 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007439 errors);
7440 if (ret == -2)
7441 ret = encode_code_page_errors(code_page, &outbytes,
7442 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007443 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007444 if (ret < 0) {
7445 Py_XDECREF(outbytes);
7446 return NULL;
7447 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007448
Victor Stinner7581cef2011-11-03 22:32:33 +01007449 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007450 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007451 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007452
Victor Stinner3a50e702011-10-18 21:21:00 +02007453 return outbytes;
7454}
7455
7456PyObject *
7457PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7458 Py_ssize_t size,
7459 const char *errors)
7460{
Victor Stinner7581cef2011-11-03 22:32:33 +01007461 PyObject *unicode, *res;
7462 unicode = PyUnicode_FromUnicode(p, size);
7463 if (unicode == NULL)
7464 return NULL;
7465 res = encode_code_page(CP_ACP, unicode, errors);
7466 Py_DECREF(unicode);
7467 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007468}
7469
7470PyObject *
7471PyUnicode_EncodeCodePage(int code_page,
7472 PyObject *unicode,
7473 const char *errors)
7474{
Victor Stinner7581cef2011-11-03 22:32:33 +01007475 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007476}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007477
Alexander Belopolsky40018472011-02-26 01:02:56 +00007478PyObject *
7479PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007480{
Victor Stinner7581cef2011-11-03 22:32:33 +01007481 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007482}
7483
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007484#undef NEED_RETRY
7485
Victor Stinner99b95382011-07-04 14:23:54 +02007486#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007487
Guido van Rossumd57fd912000-03-10 22:53:23 +00007488/* --- Character Mapping Codec -------------------------------------------- */
7489
Victor Stinnerfb161b12013-04-18 01:44:27 +02007490static int
7491charmap_decode_string(const char *s,
7492 Py_ssize_t size,
7493 PyObject *mapping,
7494 const char *errors,
7495 _PyUnicodeWriter *writer)
7496{
7497 const char *starts = s;
7498 const char *e;
7499 Py_ssize_t startinpos, endinpos;
7500 PyObject *errorHandler = NULL, *exc = NULL;
7501 Py_ssize_t maplen;
7502 enum PyUnicode_Kind mapkind;
7503 void *mapdata;
7504 Py_UCS4 x;
7505 unsigned char ch;
7506
7507 if (PyUnicode_READY(mapping) == -1)
7508 return -1;
7509
7510 maplen = PyUnicode_GET_LENGTH(mapping);
7511 mapdata = PyUnicode_DATA(mapping);
7512 mapkind = PyUnicode_KIND(mapping);
7513
7514 e = s + size;
7515
7516 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7517 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7518 * is disabled in encoding aliases, latin1 is preferred because
7519 * its implementation is faster. */
7520 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7521 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7522 Py_UCS4 maxchar = writer->maxchar;
7523
7524 assert (writer->kind == PyUnicode_1BYTE_KIND);
7525 while (s < e) {
7526 ch = *s;
7527 x = mapdata_ucs1[ch];
7528 if (x > maxchar) {
7529 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7530 goto onError;
7531 maxchar = writer->maxchar;
7532 outdata = (Py_UCS1 *)writer->data;
7533 }
7534 outdata[writer->pos] = x;
7535 writer->pos++;
7536 ++s;
7537 }
7538 return 0;
7539 }
7540
7541 while (s < e) {
7542 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7543 enum PyUnicode_Kind outkind = writer->kind;
7544 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7545 if (outkind == PyUnicode_1BYTE_KIND) {
7546 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7547 Py_UCS4 maxchar = writer->maxchar;
7548 while (s < e) {
7549 ch = *s;
7550 x = mapdata_ucs2[ch];
7551 if (x > maxchar)
7552 goto Error;
7553 outdata[writer->pos] = x;
7554 writer->pos++;
7555 ++s;
7556 }
7557 break;
7558 }
7559 else if (outkind == PyUnicode_2BYTE_KIND) {
7560 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7561 while (s < e) {
7562 ch = *s;
7563 x = mapdata_ucs2[ch];
7564 if (x == 0xFFFE)
7565 goto Error;
7566 outdata[writer->pos] = x;
7567 writer->pos++;
7568 ++s;
7569 }
7570 break;
7571 }
7572 }
7573 ch = *s;
7574
7575 if (ch < maplen)
7576 x = PyUnicode_READ(mapkind, mapdata, ch);
7577 else
7578 x = 0xfffe; /* invalid value */
7579Error:
7580 if (x == 0xfffe)
7581 {
7582 /* undefined mapping */
7583 startinpos = s-starts;
7584 endinpos = startinpos+1;
7585 if (unicode_decode_call_errorhandler_writer(
7586 errors, &errorHandler,
7587 "charmap", "character maps to <undefined>",
7588 &starts, &e, &startinpos, &endinpos, &exc, &s,
7589 writer)) {
7590 goto onError;
7591 }
7592 continue;
7593 }
7594
7595 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7596 goto onError;
7597 ++s;
7598 }
7599 Py_XDECREF(errorHandler);
7600 Py_XDECREF(exc);
7601 return 0;
7602
7603onError:
7604 Py_XDECREF(errorHandler);
7605 Py_XDECREF(exc);
7606 return -1;
7607}
7608
7609static int
7610charmap_decode_mapping(const char *s,
7611 Py_ssize_t size,
7612 PyObject *mapping,
7613 const char *errors,
7614 _PyUnicodeWriter *writer)
7615{
7616 const char *starts = s;
7617 const char *e;
7618 Py_ssize_t startinpos, endinpos;
7619 PyObject *errorHandler = NULL, *exc = NULL;
7620 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007621 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007622
7623 e = s + size;
7624
7625 while (s < e) {
7626 ch = *s;
7627
7628 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7629 key = PyLong_FromLong((long)ch);
7630 if (key == NULL)
7631 goto onError;
7632
7633 item = PyObject_GetItem(mapping, key);
7634 Py_DECREF(key);
7635 if (item == NULL) {
7636 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7637 /* No mapping found means: mapping is undefined. */
7638 PyErr_Clear();
7639 goto Undefined;
7640 } else
7641 goto onError;
7642 }
7643
7644 /* Apply mapping */
7645 if (item == Py_None)
7646 goto Undefined;
7647 if (PyLong_Check(item)) {
7648 long value = PyLong_AS_LONG(item);
7649 if (value == 0xFFFE)
7650 goto Undefined;
7651 if (value < 0 || value > MAX_UNICODE) {
7652 PyErr_Format(PyExc_TypeError,
7653 "character mapping must be in range(0x%lx)",
7654 (unsigned long)MAX_UNICODE + 1);
7655 goto onError;
7656 }
7657
7658 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7659 goto onError;
7660 }
7661 else if (PyUnicode_Check(item)) {
7662 if (PyUnicode_READY(item) == -1)
7663 goto onError;
7664 if (PyUnicode_GET_LENGTH(item) == 1) {
7665 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7666 if (value == 0xFFFE)
7667 goto Undefined;
7668 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7669 goto onError;
7670 }
7671 else {
7672 writer->overallocate = 1;
7673 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7674 goto onError;
7675 }
7676 }
7677 else {
7678 /* wrong return value */
7679 PyErr_SetString(PyExc_TypeError,
7680 "character mapping must return integer, None or str");
7681 goto onError;
7682 }
7683 Py_CLEAR(item);
7684 ++s;
7685 continue;
7686
7687Undefined:
7688 /* undefined mapping */
7689 Py_CLEAR(item);
7690 startinpos = s-starts;
7691 endinpos = startinpos+1;
7692 if (unicode_decode_call_errorhandler_writer(
7693 errors, &errorHandler,
7694 "charmap", "character maps to <undefined>",
7695 &starts, &e, &startinpos, &endinpos, &exc, &s,
7696 writer)) {
7697 goto onError;
7698 }
7699 }
7700 Py_XDECREF(errorHandler);
7701 Py_XDECREF(exc);
7702 return 0;
7703
7704onError:
7705 Py_XDECREF(item);
7706 Py_XDECREF(errorHandler);
7707 Py_XDECREF(exc);
7708 return -1;
7709}
7710
Alexander Belopolsky40018472011-02-26 01:02:56 +00007711PyObject *
7712PyUnicode_DecodeCharmap(const char *s,
7713 Py_ssize_t size,
7714 PyObject *mapping,
7715 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007717 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007718
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719 /* Default to Latin-1 */
7720 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007721 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722
Guido van Rossumd57fd912000-03-10 22:53:23 +00007723 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007724 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007725 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007726 writer.min_length = size;
7727 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007728 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007729
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007730 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007731 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7732 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007733 }
7734 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007735 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7736 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007737 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007738 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007739
Benjamin Peterson29060642009-01-31 22:14:21 +00007740 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007741 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742 return NULL;
7743}
7744
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007745/* Charmap encoding: the lookup table */
7746
Alexander Belopolsky40018472011-02-26 01:02:56 +00007747struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007748 PyObject_HEAD
7749 unsigned char level1[32];
7750 int count2, count3;
7751 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007752};
7753
7754static PyObject*
7755encoding_map_size(PyObject *obj, PyObject* args)
7756{
7757 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007758 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007759 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007760}
7761
7762static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007763 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007764 PyDoc_STR("Return the size (in bytes) of this object") },
7765 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007766};
7767
7768static void
7769encoding_map_dealloc(PyObject* o)
7770{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007771 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007772}
7773
7774static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007775 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007776 "EncodingMap", /*tp_name*/
7777 sizeof(struct encoding_map), /*tp_basicsize*/
7778 0, /*tp_itemsize*/
7779 /* methods */
7780 encoding_map_dealloc, /*tp_dealloc*/
7781 0, /*tp_print*/
7782 0, /*tp_getattr*/
7783 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007784 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007785 0, /*tp_repr*/
7786 0, /*tp_as_number*/
7787 0, /*tp_as_sequence*/
7788 0, /*tp_as_mapping*/
7789 0, /*tp_hash*/
7790 0, /*tp_call*/
7791 0, /*tp_str*/
7792 0, /*tp_getattro*/
7793 0, /*tp_setattro*/
7794 0, /*tp_as_buffer*/
7795 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7796 0, /*tp_doc*/
7797 0, /*tp_traverse*/
7798 0, /*tp_clear*/
7799 0, /*tp_richcompare*/
7800 0, /*tp_weaklistoffset*/
7801 0, /*tp_iter*/
7802 0, /*tp_iternext*/
7803 encoding_map_methods, /*tp_methods*/
7804 0, /*tp_members*/
7805 0, /*tp_getset*/
7806 0, /*tp_base*/
7807 0, /*tp_dict*/
7808 0, /*tp_descr_get*/
7809 0, /*tp_descr_set*/
7810 0, /*tp_dictoffset*/
7811 0, /*tp_init*/
7812 0, /*tp_alloc*/
7813 0, /*tp_new*/
7814 0, /*tp_free*/
7815 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007816};
7817
7818PyObject*
7819PyUnicode_BuildEncodingMap(PyObject* string)
7820{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007821 PyObject *result;
7822 struct encoding_map *mresult;
7823 int i;
7824 int need_dict = 0;
7825 unsigned char level1[32];
7826 unsigned char level2[512];
7827 unsigned char *mlevel1, *mlevel2, *mlevel3;
7828 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007829 int kind;
7830 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007831 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007832 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007833
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007834 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007835 PyErr_BadArgument();
7836 return NULL;
7837 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007838 kind = PyUnicode_KIND(string);
7839 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007840 length = PyUnicode_GET_LENGTH(string);
7841 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007842 memset(level1, 0xFF, sizeof level1);
7843 memset(level2, 0xFF, sizeof level2);
7844
7845 /* If there isn't a one-to-one mapping of NULL to \0,
7846 or if there are non-BMP characters, we need to use
7847 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007848 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007849 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007850 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007851 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007852 ch = PyUnicode_READ(kind, data, i);
7853 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007854 need_dict = 1;
7855 break;
7856 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007857 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007858 /* unmapped character */
7859 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007860 l1 = ch >> 11;
7861 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007862 if (level1[l1] == 0xFF)
7863 level1[l1] = count2++;
7864 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007865 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007866 }
7867
7868 if (count2 >= 0xFF || count3 >= 0xFF)
7869 need_dict = 1;
7870
7871 if (need_dict) {
7872 PyObject *result = PyDict_New();
7873 PyObject *key, *value;
7874 if (!result)
7875 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007876 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007877 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007878 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007879 if (!key || !value)
7880 goto failed1;
7881 if (PyDict_SetItem(result, key, value) == -1)
7882 goto failed1;
7883 Py_DECREF(key);
7884 Py_DECREF(value);
7885 }
7886 return result;
7887 failed1:
7888 Py_XDECREF(key);
7889 Py_XDECREF(value);
7890 Py_DECREF(result);
7891 return NULL;
7892 }
7893
7894 /* Create a three-level trie */
7895 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7896 16*count2 + 128*count3 - 1);
7897 if (!result)
7898 return PyErr_NoMemory();
7899 PyObject_Init(result, &EncodingMapType);
7900 mresult = (struct encoding_map*)result;
7901 mresult->count2 = count2;
7902 mresult->count3 = count3;
7903 mlevel1 = mresult->level1;
7904 mlevel2 = mresult->level23;
7905 mlevel3 = mresult->level23 + 16*count2;
7906 memcpy(mlevel1, level1, 32);
7907 memset(mlevel2, 0xFF, 16*count2);
7908 memset(mlevel3, 0, 128*count3);
7909 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007910 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007911 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007912 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7913 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007914 /* unmapped character */
7915 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007916 o1 = ch>>11;
7917 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007918 i2 = 16*mlevel1[o1] + o2;
7919 if (mlevel2[i2] == 0xFF)
7920 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007921 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007922 i3 = 128*mlevel2[i2] + o3;
7923 mlevel3[i3] = i;
7924 }
7925 return result;
7926}
7927
7928static int
Victor Stinner22168992011-11-20 17:09:18 +01007929encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007930{
7931 struct encoding_map *map = (struct encoding_map*)mapping;
7932 int l1 = c>>11;
7933 int l2 = (c>>7) & 0xF;
7934 int l3 = c & 0x7F;
7935 int i;
7936
Victor Stinner22168992011-11-20 17:09:18 +01007937 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007938 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007939 if (c == 0)
7940 return 0;
7941 /* level 1*/
7942 i = map->level1[l1];
7943 if (i == 0xFF) {
7944 return -1;
7945 }
7946 /* level 2*/
7947 i = map->level23[16*i+l2];
7948 if (i == 0xFF) {
7949 return -1;
7950 }
7951 /* level 3 */
7952 i = map->level23[16*map->count2 + 128*i + l3];
7953 if (i == 0) {
7954 return -1;
7955 }
7956 return i;
7957}
7958
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007959/* Lookup the character ch in the mapping. If the character
7960 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007961 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007962static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007963charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964{
Christian Heimes217cfd12007-12-02 14:31:20 +00007965 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007966 PyObject *x;
7967
7968 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007969 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007970 x = PyObject_GetItem(mapping, w);
7971 Py_DECREF(w);
7972 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7974 /* No mapping found means: mapping is undefined. */
7975 PyErr_Clear();
7976 x = Py_None;
7977 Py_INCREF(x);
7978 return x;
7979 } else
7980 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007982 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007984 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 long value = PyLong_AS_LONG(x);
7986 if (value < 0 || value > 255) {
7987 PyErr_SetString(PyExc_TypeError,
7988 "character mapping must be in range(256)");
7989 Py_DECREF(x);
7990 return NULL;
7991 }
7992 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007993 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007994 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007997 /* wrong return value */
7998 PyErr_Format(PyExc_TypeError,
7999 "character mapping must return integer, bytes or None, not %.400s",
8000 x->ob_type->tp_name);
8001 Py_DECREF(x);
8002 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003 }
8004}
8005
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008006static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008007charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008008{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008009 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8010 /* exponentially overallocate to minimize reallocations */
8011 if (requiredsize < 2*outsize)
8012 requiredsize = 2*outsize;
8013 if (_PyBytes_Resize(outobj, requiredsize))
8014 return -1;
8015 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008016}
8017
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008021/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008022 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008023 space is available. Return a new reference to the object that
8024 was put in the output buffer, or Py_None, if the mapping was undefined
8025 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008026 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008027static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008028charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008029 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008030{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008031 PyObject *rep;
8032 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008033 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008034
Christian Heimes90aa7642007-12-19 02:45:37 +00008035 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008036 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008038 if (res == -1)
8039 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 if (outsize<requiredsize)
8041 if (charmapencode_resize(outobj, outpos, requiredsize))
8042 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008043 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 outstart[(*outpos)++] = (char)res;
8045 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008046 }
8047
8048 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008049 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008051 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008052 Py_DECREF(rep);
8053 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008054 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 if (PyLong_Check(rep)) {
8056 Py_ssize_t requiredsize = *outpos+1;
8057 if (outsize<requiredsize)
8058 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8059 Py_DECREF(rep);
8060 return enc_EXCEPTION;
8061 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008062 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008064 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008065 else {
8066 const char *repchars = PyBytes_AS_STRING(rep);
8067 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8068 Py_ssize_t requiredsize = *outpos+repsize;
8069 if (outsize<requiredsize)
8070 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8071 Py_DECREF(rep);
8072 return enc_EXCEPTION;
8073 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008074 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 memcpy(outstart + *outpos, repchars, repsize);
8076 *outpos += repsize;
8077 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008078 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008079 Py_DECREF(rep);
8080 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008081}
8082
8083/* handle an error in PyUnicode_EncodeCharmap
8084 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008085static int
8086charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008087 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008089 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008090 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008091{
8092 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008093 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008094 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008095 enum PyUnicode_Kind kind;
8096 void *data;
8097 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008098 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008099 Py_ssize_t collstartpos = *inpos;
8100 Py_ssize_t collendpos = *inpos+1;
8101 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008102 char *encoding = "charmap";
8103 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008104 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008105 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008106 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008107
Benjamin Petersonbac79492012-01-14 13:34:47 -05008108 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008109 return -1;
8110 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008111 /* find all unencodable characters */
8112 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008113 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008114 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008115 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008116 val = encoding_map_lookup(ch, mapping);
8117 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 break;
8119 ++collendpos;
8120 continue;
8121 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008122
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008123 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8124 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 if (rep==NULL)
8126 return -1;
8127 else if (rep!=Py_None) {
8128 Py_DECREF(rep);
8129 break;
8130 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008131 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008132 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008133 }
8134 /* cache callback name lookup
8135 * (if not done yet, i.e. it's the first error) */
8136 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 if ((errors==NULL) || (!strcmp(errors, "strict")))
8138 *known_errorHandler = 1;
8139 else if (!strcmp(errors, "replace"))
8140 *known_errorHandler = 2;
8141 else if (!strcmp(errors, "ignore"))
8142 *known_errorHandler = 3;
8143 else if (!strcmp(errors, "xmlcharrefreplace"))
8144 *known_errorHandler = 4;
8145 else
8146 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008147 }
8148 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008149 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008150 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008151 return -1;
8152 case 2: /* replace */
8153 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 x = charmapencode_output('?', mapping, res, respos);
8155 if (x==enc_EXCEPTION) {
8156 return -1;
8157 }
8158 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008159 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008160 return -1;
8161 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008162 }
8163 /* fall through */
8164 case 3: /* ignore */
8165 *inpos = collendpos;
8166 break;
8167 case 4: /* xmlcharrefreplace */
8168 /* generate replacement (temporarily (mis)uses p) */
8169 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008170 char buffer[2+29+1+1];
8171 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008172 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 for (cp = buffer; *cp; ++cp) {
8174 x = charmapencode_output(*cp, mapping, res, respos);
8175 if (x==enc_EXCEPTION)
8176 return -1;
8177 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008178 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 return -1;
8180 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008181 }
8182 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008183 *inpos = collendpos;
8184 break;
8185 default:
8186 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008187 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008189 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008191 if (PyBytes_Check(repunicode)) {
8192 /* Directly copy bytes result to output. */
8193 Py_ssize_t outsize = PyBytes_Size(*res);
8194 Py_ssize_t requiredsize;
8195 repsize = PyBytes_Size(repunicode);
8196 requiredsize = *respos + repsize;
8197 if (requiredsize > outsize)
8198 /* Make room for all additional bytes. */
8199 if (charmapencode_resize(res, respos, requiredsize)) {
8200 Py_DECREF(repunicode);
8201 return -1;
8202 }
8203 memcpy(PyBytes_AsString(*res) + *respos,
8204 PyBytes_AsString(repunicode), repsize);
8205 *respos += repsize;
8206 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008207 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008208 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008209 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008210 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008211 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008212 Py_DECREF(repunicode);
8213 return -1;
8214 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008215 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008216 data = PyUnicode_DATA(repunicode);
8217 kind = PyUnicode_KIND(repunicode);
8218 for (index = 0; index < repsize; index++) {
8219 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8220 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008222 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008223 return -1;
8224 }
8225 else if (x==enc_FAILED) {
8226 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008227 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 return -1;
8229 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008230 }
8231 *inpos = newpos;
8232 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008233 }
8234 return 0;
8235}
8236
Alexander Belopolsky40018472011-02-26 01:02:56 +00008237PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008238_PyUnicode_EncodeCharmap(PyObject *unicode,
8239 PyObject *mapping,
8240 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008241{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008242 /* output object */
8243 PyObject *res = NULL;
8244 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008245 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008246 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008247 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008248 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249 PyObject *errorHandler = NULL;
8250 PyObject *exc = NULL;
8251 /* the following variable is used for caching string comparisons
8252 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8253 * 3=ignore, 4=xmlcharrefreplace */
8254 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008255 void *data;
8256 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008257
Benjamin Petersonbac79492012-01-14 13:34:47 -05008258 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008259 return NULL;
8260 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008261 data = PyUnicode_DATA(unicode);
8262 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008263
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264 /* Default to Latin-1 */
8265 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008266 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008268 /* allocate enough for a simple encoding without
8269 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008270 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008271 if (res == NULL)
8272 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008273 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008276 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008277 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008279 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 if (x==enc_EXCEPTION) /* error */
8281 goto onError;
8282 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008283 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 &exc,
8285 &known_errorHandler, &errorHandler, errors,
8286 &res, &respos)) {
8287 goto onError;
8288 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008289 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 else
8291 /* done with this character => adjust input position */
8292 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008295 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008296 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008297 if (_PyBytes_Resize(&res, respos) < 0)
8298 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008299
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008300 Py_XDECREF(exc);
8301 Py_XDECREF(errorHandler);
8302 return res;
8303
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305 Py_XDECREF(res);
8306 Py_XDECREF(exc);
8307 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308 return NULL;
8309}
8310
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008311/* Deprecated */
8312PyObject *
8313PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8314 Py_ssize_t size,
8315 PyObject *mapping,
8316 const char *errors)
8317{
8318 PyObject *result;
8319 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8320 if (unicode == NULL)
8321 return NULL;
8322 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8323 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008324 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008325}
8326
Alexander Belopolsky40018472011-02-26 01:02:56 +00008327PyObject *
8328PyUnicode_AsCharmapString(PyObject *unicode,
8329 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330{
8331 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 PyErr_BadArgument();
8333 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008334 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008335 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336}
8337
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008338/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008339static void
8340make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008341 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008342 Py_ssize_t startpos, Py_ssize_t endpos,
8343 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008344{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008345 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008346 *exceptionObject = _PyUnicodeTranslateError_Create(
8347 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008348 }
8349 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8351 goto onError;
8352 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8353 goto onError;
8354 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8355 goto onError;
8356 return;
8357 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008358 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359 }
8360}
8361
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362/* error handling callback helper:
8363 build arguments, call the callback and check the arguments,
8364 put the result into newpos and return the replacement string, which
8365 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008366static PyObject *
8367unicode_translate_call_errorhandler(const char *errors,
8368 PyObject **errorHandler,
8369 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008370 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008371 Py_ssize_t startpos, Py_ssize_t endpos,
8372 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008373{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008374 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008375
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008376 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377 PyObject *restuple;
8378 PyObject *resunicode;
8379
8380 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008384 }
8385
8386 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008388 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390
8391 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008393 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008395 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008396 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 Py_DECREF(restuple);
8398 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008399 }
8400 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 &resunicode, &i_newpos)) {
8402 Py_DECREF(restuple);
8403 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008405 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008406 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008407 else
8408 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008409 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008410 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 Py_DECREF(restuple);
8412 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008413 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008414 Py_INCREF(resunicode);
8415 Py_DECREF(restuple);
8416 return resunicode;
8417}
8418
8419/* Lookup the character ch in the mapping and put the result in result,
8420 which must be decrefed by the caller.
8421 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008422static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008423charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424{
Christian Heimes217cfd12007-12-02 14:31:20 +00008425 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008426 PyObject *x;
8427
8428 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430 x = PyObject_GetItem(mapping, w);
8431 Py_DECREF(w);
8432 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8434 /* No mapping found means: use 1:1 mapping. */
8435 PyErr_Clear();
8436 *result = NULL;
8437 return 0;
8438 } else
8439 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008440 }
8441 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 *result = x;
8443 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008445 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008447 if (value < 0 || value > MAX_UNICODE) {
8448 PyErr_Format(PyExc_ValueError,
8449 "character mapping must be in range(0x%x)",
8450 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 Py_DECREF(x);
8452 return -1;
8453 }
8454 *result = x;
8455 return 0;
8456 }
8457 else if (PyUnicode_Check(x)) {
8458 *result = x;
8459 return 0;
8460 }
8461 else {
8462 /* wrong return value */
8463 PyErr_SetString(PyExc_TypeError,
8464 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008465 Py_DECREF(x);
8466 return -1;
8467 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008468}
Victor Stinner1194ea02014-04-04 19:37:40 +02008469
8470/* lookup the character, write the result into the writer.
8471 Return 1 if the result was written into the writer, return 0 if the mapping
8472 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008473static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008474charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8475 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008476{
Victor Stinner1194ea02014-04-04 19:37:40 +02008477 PyObject *item;
8478
8479 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008481
8482 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008484 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008485 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008487 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008488 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008489
8490 if (item == Py_None) {
8491 Py_DECREF(item);
8492 return 0;
8493 }
8494
8495 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008496 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8497 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8498 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008499 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8500 Py_DECREF(item);
8501 return -1;
8502 }
8503 Py_DECREF(item);
8504 return 1;
8505 }
8506
8507 if (!PyUnicode_Check(item)) {
8508 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008510 }
8511
8512 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8513 Py_DECREF(item);
8514 return -1;
8515 }
8516
8517 Py_DECREF(item);
8518 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519}
8520
Victor Stinner89a76ab2014-04-05 11:44:04 +02008521static int
8522unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8523 Py_UCS1 *translate)
8524{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008525 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008526 int ret = 0;
8527
Victor Stinner89a76ab2014-04-05 11:44:04 +02008528 if (charmaptranslate_lookup(ch, mapping, &item)) {
8529 return -1;
8530 }
8531
8532 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008533 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008534 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008535 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008536 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008537 /* not found => default to 1:1 mapping */
8538 translate[ch] = ch;
8539 return 1;
8540 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008541 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008542 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008543 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8544 used it */
8545 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008546 /* invalid character or character outside ASCII:
8547 skip the fast translate */
8548 goto exit;
8549 }
8550 translate[ch] = (Py_UCS1)replace;
8551 }
8552 else if (PyUnicode_Check(item)) {
8553 Py_UCS4 replace;
8554
8555 if (PyUnicode_READY(item) == -1) {
8556 Py_DECREF(item);
8557 return -1;
8558 }
8559 if (PyUnicode_GET_LENGTH(item) != 1)
8560 goto exit;
8561
8562 replace = PyUnicode_READ_CHAR(item, 0);
8563 if (replace > 127)
8564 goto exit;
8565 translate[ch] = (Py_UCS1)replace;
8566 }
8567 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008568 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008569 goto exit;
8570 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008571 ret = 1;
8572
Benjamin Peterson1365de72014-04-07 20:15:41 -04008573 exit:
8574 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008575 return ret;
8576}
8577
8578/* Fast path for ascii => ascii translation. Return 1 if the whole string
8579 was translated into writer, return 0 if the input string was partially
8580 translated into writer, raise an exception and return -1 on error. */
8581static int
8582unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008583 _PyUnicodeWriter *writer, int ignore,
8584 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008585{
Victor Stinner872b2912014-04-05 14:27:07 +02008586 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008587 Py_ssize_t len;
8588 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008589 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008590
Victor Stinner89a76ab2014-04-05 11:44:04 +02008591 len = PyUnicode_GET_LENGTH(input);
8592
Victor Stinner872b2912014-04-05 14:27:07 +02008593 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008594
8595 in = PyUnicode_1BYTE_DATA(input);
8596 end = in + len;
8597
8598 assert(PyUnicode_IS_ASCII(writer->buffer));
8599 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8600 out = PyUnicode_1BYTE_DATA(writer->buffer);
8601
Victor Stinner872b2912014-04-05 14:27:07 +02008602 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008603 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008604 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008605 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008606 int translate = unicode_fast_translate_lookup(mapping, ch,
8607 ascii_table);
8608 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008609 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008610 if (translate == 0)
8611 goto exit;
8612 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008613 }
Victor Stinner872b2912014-04-05 14:27:07 +02008614 if (ch2 == 0xfe) {
8615 if (ignore)
8616 continue;
8617 goto exit;
8618 }
8619 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008620 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008621 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008622 }
Victor Stinner872b2912014-04-05 14:27:07 +02008623 res = 1;
8624
8625exit:
8626 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008627 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008628 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008629}
8630
Alexander Belopolsky40018472011-02-26 01:02:56 +00008631PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632_PyUnicode_TranslateCharmap(PyObject *input,
8633 PyObject *mapping,
8634 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008637 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 Py_ssize_t size, i;
8639 int kind;
8640 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008641 _PyUnicodeWriter writer;
8642 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008643 char *reason = "character maps to <undefined>";
8644 PyObject *errorHandler = NULL;
8645 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008646 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008647 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008648
Guido van Rossumd57fd912000-03-10 22:53:23 +00008649 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 PyErr_BadArgument();
8651 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008654 if (PyUnicode_READY(input) == -1)
8655 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008656 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 kind = PyUnicode_KIND(input);
8658 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659
8660 if (size == 0) {
8661 Py_INCREF(input);
8662 return input;
8663 }
8664
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665 /* allocate enough for a simple 1:1 translation without
8666 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008667 _PyUnicodeWriter_Init(&writer);
8668 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670
Victor Stinner872b2912014-04-05 14:27:07 +02008671 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8672
Victor Stinner33798672016-03-01 21:59:58 +01008673 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008674 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008675 if (PyUnicode_IS_ASCII(input)) {
8676 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8677 if (res < 0) {
8678 _PyUnicodeWriter_Dealloc(&writer);
8679 return NULL;
8680 }
8681 if (res == 1)
8682 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008683 }
Victor Stinner33798672016-03-01 21:59:58 +01008684 else {
8685 i = 0;
8686 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008688 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008690 int translate;
8691 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8692 Py_ssize_t newpos;
8693 /* startpos for collecting untranslatable chars */
8694 Py_ssize_t collstart;
8695 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008696 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697
Victor Stinner1194ea02014-04-04 19:37:40 +02008698 ch = PyUnicode_READ(kind, data, i);
8699 translate = charmaptranslate_output(ch, mapping, &writer);
8700 if (translate < 0)
8701 goto onError;
8702
8703 if (translate != 0) {
8704 /* it worked => adjust input pointer */
8705 ++i;
8706 continue;
8707 }
8708
8709 /* untranslatable character */
8710 collstart = i;
8711 collend = i+1;
8712
8713 /* find all untranslatable characters */
8714 while (collend < size) {
8715 PyObject *x;
8716 ch = PyUnicode_READ(kind, data, collend);
8717 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008718 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008719 Py_XDECREF(x);
8720 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008722 ++collend;
8723 }
8724
8725 if (ignore) {
8726 i = collend;
8727 }
8728 else {
8729 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8730 reason, input, &exc,
8731 collstart, collend, &newpos);
8732 if (repunicode == NULL)
8733 goto onError;
8734 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008736 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008737 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008738 Py_DECREF(repunicode);
8739 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008740 }
8741 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008742 Py_XDECREF(exc);
8743 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008744 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008745
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008747 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008748 Py_XDECREF(exc);
8749 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750 return NULL;
8751}
8752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753/* Deprecated. Use PyUnicode_Translate instead. */
8754PyObject *
8755PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8756 Py_ssize_t size,
8757 PyObject *mapping,
8758 const char *errors)
8759{
Christian Heimes5f520f42012-09-11 14:03:25 +02008760 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008761 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8762 if (!unicode)
8763 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008764 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8765 Py_DECREF(unicode);
8766 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008767}
8768
Alexander Belopolsky40018472011-02-26 01:02:56 +00008769PyObject *
8770PyUnicode_Translate(PyObject *str,
8771 PyObject *mapping,
8772 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008773{
8774 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008775
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776 str = PyUnicode_FromObject(str);
8777 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008778 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008779 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008780 Py_DECREF(str);
8781 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782}
Tim Petersced69f82003-09-16 20:30:58 +00008783
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008784static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008785fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008786{
8787 /* No need to call PyUnicode_READY(self) because this function is only
8788 called as a callback from fixup() which does it already. */
8789 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8790 const int kind = PyUnicode_KIND(self);
8791 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008792 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008793 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008794 Py_ssize_t i;
8795
8796 for (i = 0; i < len; ++i) {
8797 ch = PyUnicode_READ(kind, data, i);
8798 fixed = 0;
8799 if (ch > 127) {
8800 if (Py_UNICODE_ISSPACE(ch))
8801 fixed = ' ';
8802 else {
8803 const int decimal = Py_UNICODE_TODECIMAL(ch);
8804 if (decimal >= 0)
8805 fixed = '0' + decimal;
8806 }
8807 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008808 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008809 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008810 PyUnicode_WRITE(kind, data, i, fixed);
8811 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008812 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008813 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008814 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008815 }
8816
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008817 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008818}
8819
8820PyObject *
8821_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8822{
8823 if (!PyUnicode_Check(unicode)) {
8824 PyErr_BadInternalCall();
8825 return NULL;
8826 }
8827 if (PyUnicode_READY(unicode) == -1)
8828 return NULL;
8829 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8830 /* If the string is already ASCII, just return the same string */
8831 Py_INCREF(unicode);
8832 return unicode;
8833 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008834 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835}
8836
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008837PyObject *
8838PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8839 Py_ssize_t length)
8840{
Victor Stinnerf0124502011-11-21 23:12:56 +01008841 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008842 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008843 Py_UCS4 maxchar;
8844 enum PyUnicode_Kind kind;
8845 void *data;
8846
Victor Stinner99d7ad02012-02-22 13:37:39 +01008847 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008848 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008849 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008850 if (ch > 127) {
8851 int decimal = Py_UNICODE_TODECIMAL(ch);
8852 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008853 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008854 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008855 }
8856 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008857
8858 /* Copy to a new string */
8859 decimal = PyUnicode_New(length, maxchar);
8860 if (decimal == NULL)
8861 return decimal;
8862 kind = PyUnicode_KIND(decimal);
8863 data = PyUnicode_DATA(decimal);
8864 /* Iterate over code points */
8865 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008866 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01008867 if (ch > 127) {
8868 int decimal = Py_UNICODE_TODECIMAL(ch);
8869 if (decimal >= 0)
8870 ch = '0' + decimal;
8871 }
8872 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008873 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008874 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008875}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008876/* --- Decimal Encoder ---------------------------------------------------- */
8877
Alexander Belopolsky40018472011-02-26 01:02:56 +00008878int
8879PyUnicode_EncodeDecimal(Py_UNICODE *s,
8880 Py_ssize_t length,
8881 char *output,
8882 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008883{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008884 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008885 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008886 enum PyUnicode_Kind kind;
8887 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008888
8889 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008890 PyErr_BadArgument();
8891 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008892 }
8893
Victor Stinner42bf7752011-11-21 22:52:58 +01008894 unicode = PyUnicode_FromUnicode(s, length);
8895 if (unicode == NULL)
8896 return -1;
8897
Benjamin Petersonbac79492012-01-14 13:34:47 -05008898 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008899 Py_DECREF(unicode);
8900 return -1;
8901 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008902 kind = PyUnicode_KIND(unicode);
8903 data = PyUnicode_DATA(unicode);
8904
Victor Stinnerb84d7232011-11-22 01:50:07 +01008905 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008906 PyObject *exc;
8907 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008908 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008909 Py_ssize_t startpos;
8910
8911 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008912
Benjamin Peterson29060642009-01-31 22:14:21 +00008913 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008914 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008915 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008916 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008917 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008918 decimal = Py_UNICODE_TODECIMAL(ch);
8919 if (decimal >= 0) {
8920 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008921 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008922 continue;
8923 }
8924 if (0 < ch && ch < 256) {
8925 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008926 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008927 continue;
8928 }
Victor Stinner6345be92011-11-25 20:09:01 +01008929
Victor Stinner42bf7752011-11-21 22:52:58 +01008930 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008931 exc = NULL;
8932 raise_encode_exception(&exc, "decimal", unicode,
8933 startpos, startpos+1,
8934 "invalid decimal Unicode string");
8935 Py_XDECREF(exc);
8936 Py_DECREF(unicode);
8937 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008938 }
8939 /* 0-terminate the output string */
8940 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008941 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008942 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008943}
8944
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945/* --- Helpers ------------------------------------------------------------ */
8946
/* Helper macro to fix up start/end slice values against a length:
     - end greater than len is clipped to len;
     - a negative end is shifted by len and then clipped to 0;
     - a negative start is shifted by len and then clipped to 0.
   `start` and `end` are assigned to, so they must be lvalues, and every
   argument is evaluated more than once.  The expansion is two statements
   that are not wrapped in a do/while block. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len) {                            \
        end = len;                              \
    }                                           \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0) {                          \
            end = 0;                            \
        }                                       \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0) {                        \
            start = 0;                          \
        }                                       \
    }
8961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008963any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964 Py_ssize_t start,
8965 Py_ssize_t end)
8966{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008967 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008968 void *buf1, *buf2;
8969 Py_ssize_t len1, len2, result;
8970
8971 kind1 = PyUnicode_KIND(s1);
8972 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008973 if (kind1 < kind2)
8974 return -1;
8975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 len1 = PyUnicode_GET_LENGTH(s1);
8977 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008978 ADJUST_INDICES(start, end, len1);
8979 if (end - start < len2)
8980 return -1;
8981
8982 buf1 = PyUnicode_DATA(s1);
8983 buf2 = PyUnicode_DATA(s2);
8984 if (len2 == 1) {
8985 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
8986 result = findchar((const char *)buf1 + kind1*start,
8987 kind1, end - start, ch, direction);
8988 if (result == -1)
8989 return -1;
8990 else
8991 return start + result;
8992 }
8993
8994 if (kind2 != kind1) {
8995 buf2 = _PyUnicode_AsKind(s2, kind1);
8996 if (!buf2)
8997 return -2;
8998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008999
Victor Stinner794d5672011-10-10 03:21:36 +02009000 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009001 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009002 case PyUnicode_1BYTE_KIND:
9003 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9004 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9005 else
9006 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9007 break;
9008 case PyUnicode_2BYTE_KIND:
9009 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9010 break;
9011 case PyUnicode_4BYTE_KIND:
9012 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9013 break;
9014 default:
9015 assert(0); result = -2;
9016 }
9017 }
9018 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009019 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009020 case PyUnicode_1BYTE_KIND:
9021 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9022 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9023 else
9024 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9025 break;
9026 case PyUnicode_2BYTE_KIND:
9027 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9028 break;
9029 case PyUnicode_4BYTE_KIND:
9030 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9031 break;
9032 default:
9033 assert(0); result = -2;
9034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035 }
9036
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009037 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009038 PyMem_Free(buf2);
9039
9040 return result;
9041}
9042
9043Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009044_PyUnicode_InsertThousandsGrouping(
9045 PyObject *unicode, Py_ssize_t index,
9046 Py_ssize_t n_buffer,
9047 void *digits, Py_ssize_t n_digits,
9048 Py_ssize_t min_width,
9049 const char *grouping, PyObject *thousands_sep,
9050 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051{
Victor Stinner41a863c2012-02-24 00:37:51 +01009052 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009053 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009054 Py_ssize_t thousands_sep_len;
9055 Py_ssize_t len;
9056
9057 if (unicode != NULL) {
9058 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009059 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009060 }
9061 else {
9062 kind = PyUnicode_1BYTE_KIND;
9063 data = NULL;
9064 }
9065 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9066 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9067 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9068 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009069 if (thousands_sep_kind < kind) {
9070 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9071 if (!thousands_sep_data)
9072 return -1;
9073 }
9074 else {
9075 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9076 if (!data)
9077 return -1;
9078 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009079 }
9080
Benjamin Petersonead6b532011-12-20 17:23:42 -06009081 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009083 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009084 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009085 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009086 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009087 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009088 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009089 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009090 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009091 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009092 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009093 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009094 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009095 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009096 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009097 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009098 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009099 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009101 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009102 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009103 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009104 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009105 break;
9106 default:
9107 assert(0);
9108 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009109 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009110 if (unicode != NULL && thousands_sep_kind != kind) {
9111 if (thousands_sep_kind < kind)
9112 PyMem_Free(thousands_sep_data);
9113 else
9114 PyMem_Free(data);
9115 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009116 if (unicode == NULL) {
9117 *maxchar = 127;
9118 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009119 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009120 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009121 }
9122 }
9123 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124}
9125
9126
Alexander Belopolsky40018472011-02-26 01:02:56 +00009127Py_ssize_t
9128PyUnicode_Count(PyObject *str,
9129 PyObject *substr,
9130 Py_ssize_t start,
9131 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009133 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009134 PyObject* str_obj;
9135 PyObject* sub_obj;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009136 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009137 void *buf1 = NULL, *buf2 = NULL;
9138 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009139
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009140 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009141 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009142 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009143 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009144 if (!sub_obj) {
9145 Py_DECREF(str_obj);
9146 return -1;
9147 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009148 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009149 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009150 Py_DECREF(str_obj);
9151 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152 }
Tim Petersced69f82003-09-16 20:30:58 +00009153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 kind1 = PyUnicode_KIND(str_obj);
9155 kind2 = PyUnicode_KIND(sub_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009156 if (kind1 < kind2) {
9157 Py_DECREF(sub_obj);
9158 Py_DECREF(str_obj);
9159 return 0;
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009160 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009162 len1 = PyUnicode_GET_LENGTH(str_obj);
9163 len2 = PyUnicode_GET_LENGTH(sub_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009164 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009165 if (end - start < len2) {
9166 Py_DECREF(sub_obj);
9167 Py_DECREF(str_obj);
9168 return 0;
9169 }
9170
9171 buf1 = PyUnicode_DATA(str_obj);
9172 buf2 = PyUnicode_DATA(sub_obj);
9173 if (kind2 != kind1) {
9174 buf2 = _PyUnicode_AsKind(sub_obj, kind1);
9175 if (!buf2)
9176 goto onError;
9177 }
9178
9179 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009180 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009181 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9182 result = asciilib_count(
9183 ((Py_UCS1*)buf1) + start, end - start,
9184 buf2, len2, PY_SSIZE_T_MAX
9185 );
9186 else
9187 result = ucs1lib_count(
9188 ((Py_UCS1*)buf1) + start, end - start,
9189 buf2, len2, PY_SSIZE_T_MAX
9190 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009191 break;
9192 case PyUnicode_2BYTE_KIND:
9193 result = ucs2lib_count(
9194 ((Py_UCS2*)buf1) + start, end - start,
9195 buf2, len2, PY_SSIZE_T_MAX
9196 );
9197 break;
9198 case PyUnicode_4BYTE_KIND:
9199 result = ucs4lib_count(
9200 ((Py_UCS4*)buf1) + start, end - start,
9201 buf2, len2, PY_SSIZE_T_MAX
9202 );
9203 break;
9204 default:
9205 assert(0); result = 0;
9206 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009207
9208 Py_DECREF(sub_obj);
9209 Py_DECREF(str_obj);
9210
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009211 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009212 PyMem_Free(buf2);
9213
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009215 onError:
9216 Py_DECREF(sub_obj);
9217 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009218 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009219 PyMem_Free(buf2);
9220 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221}
9222
Alexander Belopolsky40018472011-02-26 01:02:56 +00009223Py_ssize_t
9224PyUnicode_Find(PyObject *str,
9225 PyObject *sub,
9226 Py_ssize_t start,
9227 Py_ssize_t end,
9228 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009229{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009230 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009231
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009233 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009234 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009235 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009236 if (!sub) {
9237 Py_DECREF(str);
9238 return -2;
9239 }
9240 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9241 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009242 Py_DECREF(str);
9243 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009244 }
Tim Petersced69f82003-09-16 20:30:58 +00009245
Victor Stinner794d5672011-10-10 03:21:36 +02009246 result = any_find_slice(direction,
9247 str, sub, start, end
9248 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009249
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009251 Py_DECREF(sub);
9252
Guido van Rossumd57fd912000-03-10 22:53:23 +00009253 return result;
9254}
9255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256Py_ssize_t
9257PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9258 Py_ssize_t start, Py_ssize_t end,
9259 int direction)
9260{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009262 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 if (PyUnicode_READY(str) == -1)
9264 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009265 if (start < 0 || end < 0) {
9266 PyErr_SetString(PyExc_IndexError, "string index out of range");
9267 return -2;
9268 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269 if (end > PyUnicode_GET_LENGTH(str))
9270 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009271 if (start >= end)
9272 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009274 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9275 kind, end-start, ch, direction);
9276 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009278 else
9279 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280}
9281
Alexander Belopolsky40018472011-02-26 01:02:56 +00009282static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009283tailmatch(PyObject *self,
9284 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009285 Py_ssize_t start,
9286 Py_ssize_t end,
9287 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009288{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289 int kind_self;
9290 int kind_sub;
9291 void *data_self;
9292 void *data_sub;
9293 Py_ssize_t offset;
9294 Py_ssize_t i;
9295 Py_ssize_t end_sub;
9296
9297 if (PyUnicode_READY(self) == -1 ||
9298 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009299 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9302 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009303 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009304 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009306 if (PyUnicode_GET_LENGTH(substring) == 0)
9307 return 1;
9308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009309 kind_self = PyUnicode_KIND(self);
9310 data_self = PyUnicode_DATA(self);
9311 kind_sub = PyUnicode_KIND(substring);
9312 data_sub = PyUnicode_DATA(substring);
9313 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9314
9315 if (direction > 0)
9316 offset = end;
9317 else
9318 offset = start;
9319
9320 if (PyUnicode_READ(kind_self, data_self, offset) ==
9321 PyUnicode_READ(kind_sub, data_sub, 0) &&
9322 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9323 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9324 /* If both are of the same kind, memcmp is sufficient */
9325 if (kind_self == kind_sub) {
9326 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009327 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009328 data_sub,
9329 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009330 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009332 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009333 else {
9334 /* We do not need to compare 0 and len(substring)-1 because
9335 the if statement above ensured already that they are equal
9336 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009337 for (i = 1; i < end_sub; ++i) {
9338 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9339 PyUnicode_READ(kind_sub, data_sub, i))
9340 return 0;
9341 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009342 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009343 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009344 }
9345
9346 return 0;
9347}
9348
Alexander Belopolsky40018472011-02-26 01:02:56 +00009349Py_ssize_t
9350PyUnicode_Tailmatch(PyObject *str,
9351 PyObject *substr,
9352 Py_ssize_t start,
9353 Py_ssize_t end,
9354 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009355{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009356 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009357
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358 str = PyUnicode_FromObject(str);
9359 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009360 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009361 substr = PyUnicode_FromObject(substr);
9362 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009363 Py_DECREF(str);
9364 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 }
Tim Petersced69f82003-09-16 20:30:58 +00009366
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009367 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009368 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369 Py_DECREF(str);
9370 Py_DECREF(substr);
9371 return result;
9372}
9373
Guido van Rossumd57fd912000-03-10 22:53:23 +00009374/* Apply fixfct filter to the Unicode object self and return a
9375 reference to the modified object */
9376
Alexander Belopolsky40018472011-02-26 01:02:56 +00009377static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009378fixup(PyObject *self,
9379 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009381 PyObject *u;
9382 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009383 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009385 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009386 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009387 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009388 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390 /* fix functions return the new maximum character in a string,
9391 if the kind of the resulting unicode object does not change,
9392 everything is fine. Otherwise we need to change the string kind
9393 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009394 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009395
9396 if (maxchar_new == 0) {
9397 /* no changes */;
9398 if (PyUnicode_CheckExact(self)) {
9399 Py_DECREF(u);
9400 Py_INCREF(self);
9401 return self;
9402 }
9403 else
9404 return u;
9405 }
9406
Victor Stinnere6abb482012-05-02 01:15:40 +02009407 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408
Victor Stinnereaab6042011-12-11 22:22:39 +01009409 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009411
9412 /* In case the maximum character changed, we need to
9413 convert the string to the new category. */
9414 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9415 if (v == NULL) {
9416 Py_DECREF(u);
9417 return NULL;
9418 }
9419 if (maxchar_new > maxchar_old) {
9420 /* If the maxchar increased so that the kind changed, not all
9421 characters are representable anymore and we need to fix the
9422 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009423 _PyUnicode_FastCopyCharacters(v, 0,
9424 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009425 maxchar_old = fixfct(v);
9426 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 }
9428 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009429 _PyUnicode_FastCopyCharacters(v, 0,
9430 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009431 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009432 Py_DECREF(u);
9433 assert(_PyUnicode_CheckConsistency(v, 1));
9434 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435}
9436
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009437static PyObject *
9438ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009439{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009440 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9441 char *resdata, *data = PyUnicode_DATA(self);
9442 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009443
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009444 res = PyUnicode_New(len, 127);
9445 if (res == NULL)
9446 return NULL;
9447 resdata = PyUnicode_DATA(res);
9448 if (lower)
9449 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009451 _Py_bytes_upper(resdata, data, len);
9452 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453}
9454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009456handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009458 Py_ssize_t j;
9459 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009460 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009461 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009462
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009463 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9464
9465 where ! is a negation and \p{xxx} is a character with property xxx.
9466 */
9467 for (j = i - 1; j >= 0; j--) {
9468 c = PyUnicode_READ(kind, data, j);
9469 if (!_PyUnicode_IsCaseIgnorable(c))
9470 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009471 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009472 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9473 if (final_sigma) {
9474 for (j = i + 1; j < length; j++) {
9475 c = PyUnicode_READ(kind, data, j);
9476 if (!_PyUnicode_IsCaseIgnorable(c))
9477 break;
9478 }
9479 final_sigma = j == length || !_PyUnicode_IsCased(c);
9480 }
9481 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482}
9483
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009484static int
9485lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9486 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009488 /* Obscure special case. */
9489 if (c == 0x3A3) {
9490 mapped[0] = handle_capital_sigma(kind, data, length, i);
9491 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009493 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494}
9495
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009496static Py_ssize_t
9497do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009499 Py_ssize_t i, k = 0;
9500 int n_res, j;
9501 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009502
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009503 c = PyUnicode_READ(kind, data, 0);
9504 n_res = _PyUnicode_ToUpperFull(c, mapped);
9505 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009506 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009507 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009509 for (i = 1; i < length; i++) {
9510 c = PyUnicode_READ(kind, data, i);
9511 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9512 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009513 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009514 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009515 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009516 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009517 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518}
9519
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009520static Py_ssize_t
9521do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9522 Py_ssize_t i, k = 0;
9523
9524 for (i = 0; i < length; i++) {
9525 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9526 int n_res, j;
9527 if (Py_UNICODE_ISUPPER(c)) {
9528 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9529 }
9530 else if (Py_UNICODE_ISLOWER(c)) {
9531 n_res = _PyUnicode_ToUpperFull(c, mapped);
9532 }
9533 else {
9534 n_res = 1;
9535 mapped[0] = c;
9536 }
9537 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009538 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009539 res[k++] = mapped[j];
9540 }
9541 }
9542 return k;
9543}
9544
9545static Py_ssize_t
9546do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9547 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009548{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009549 Py_ssize_t i, k = 0;
9550
9551 for (i = 0; i < length; i++) {
9552 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9553 int n_res, j;
9554 if (lower)
9555 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9556 else
9557 n_res = _PyUnicode_ToUpperFull(c, mapped);
9558 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009559 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009560 res[k++] = mapped[j];
9561 }
9562 }
9563 return k;
9564}
9565
9566static Py_ssize_t
9567do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9568{
9569 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9570}
9571
9572static Py_ssize_t
9573do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9574{
9575 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9576}
9577
Benjamin Petersone51757f2012-01-12 21:10:29 -05009578static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009579do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9580{
9581 Py_ssize_t i, k = 0;
9582
9583 for (i = 0; i < length; i++) {
9584 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9585 Py_UCS4 mapped[3];
9586 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9587 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009588 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009589 res[k++] = mapped[j];
9590 }
9591 }
9592 return k;
9593}
9594
9595static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009596do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9597{
9598 Py_ssize_t i, k = 0;
9599 int previous_is_cased;
9600
9601 previous_is_cased = 0;
9602 for (i = 0; i < length; i++) {
9603 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9604 Py_UCS4 mapped[3];
9605 int n_res, j;
9606
9607 if (previous_is_cased)
9608 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9609 else
9610 n_res = _PyUnicode_ToTitleFull(c, mapped);
9611
9612 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009613 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009614 res[k++] = mapped[j];
9615 }
9616
9617 previous_is_cased = _PyUnicode_IsCased(c);
9618 }
9619 return k;
9620}
9621
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009622static PyObject *
9623case_operation(PyObject *self,
9624 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9625{
9626 PyObject *res = NULL;
9627 Py_ssize_t length, newlength = 0;
9628 int kind, outkind;
9629 void *data, *outdata;
9630 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9631
Benjamin Petersoneea48462012-01-16 14:28:50 -05009632 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009633
9634 kind = PyUnicode_KIND(self);
9635 data = PyUnicode_DATA(self);
9636 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009637 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009638 PyErr_SetString(PyExc_OverflowError, "string is too long");
9639 return NULL;
9640 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009641 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009642 if (tmp == NULL)
9643 return PyErr_NoMemory();
9644 newlength = perform(kind, data, length, tmp, &maxchar);
9645 res = PyUnicode_New(newlength, maxchar);
9646 if (res == NULL)
9647 goto leave;
9648 tmpend = tmp + newlength;
9649 outdata = PyUnicode_DATA(res);
9650 outkind = PyUnicode_KIND(res);
9651 switch (outkind) {
9652 case PyUnicode_1BYTE_KIND:
9653 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9654 break;
9655 case PyUnicode_2BYTE_KIND:
9656 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9657 break;
9658 case PyUnicode_4BYTE_KIND:
9659 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9660 break;
9661 default:
9662 assert(0);
9663 break;
9664 }
9665 leave:
9666 PyMem_FREE(tmp);
9667 return res;
9668}
9669
Tim Peters8ce9f162004-08-27 01:49:32 +00009670PyObject *
9671PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009673 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009674 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009675 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009676 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009677 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9678 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009679 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009680 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009681 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009682 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009683 int use_memcpy;
9684 unsigned char *res_data = NULL, *sep_data = NULL;
9685 PyObject *last_obj;
9686 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009688 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009689 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009690 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009691 }
9692
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009693 /* NOTE: the following code can't call back into Python code,
9694 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009695 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009696
Tim Peters05eba1f2004-08-27 21:32:02 +00009697 seqlen = PySequence_Fast_GET_SIZE(fseq);
9698 /* If empty sequence, return u"". */
9699 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009700 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009701 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009702 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009703
Tim Peters05eba1f2004-08-27 21:32:02 +00009704 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009705 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009706 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009707 if (seqlen == 1) {
9708 if (PyUnicode_CheckExact(items[0])) {
9709 res = items[0];
9710 Py_INCREF(res);
9711 Py_DECREF(fseq);
9712 return res;
9713 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009714 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009715 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009716 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009717 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009718 /* Set up sep and seplen */
9719 if (separator == NULL) {
9720 /* fall back to a blank space separator */
9721 sep = PyUnicode_FromOrdinal(' ');
9722 if (!sep)
9723 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009724 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009725 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009726 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009727 else {
9728 if (!PyUnicode_Check(separator)) {
9729 PyErr_Format(PyExc_TypeError,
9730 "separator: expected str instance,"
9731 " %.80s found",
9732 Py_TYPE(separator)->tp_name);
9733 goto onError;
9734 }
9735 if (PyUnicode_READY(separator))
9736 goto onError;
9737 sep = separator;
9738 seplen = PyUnicode_GET_LENGTH(separator);
9739 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9740 /* inc refcount to keep this code path symmetric with the
9741 above case of a blank separator */
9742 Py_INCREF(sep);
9743 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009744 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009745 }
9746
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009747 /* There are at least two things to join, or else we have a subclass
9748 * of str in the sequence.
9749 * Do a pre-pass to figure out the total amount of space we'll
9750 * need (sz), and see whether all argument are strings.
9751 */
9752 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009753#ifdef Py_DEBUG
9754 use_memcpy = 0;
9755#else
9756 use_memcpy = 1;
9757#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009758 for (i = 0; i < seqlen; i++) {
9759 const Py_ssize_t old_sz = sz;
9760 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009761 if (!PyUnicode_Check(item)) {
9762 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009763 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009764 " %.80s found",
9765 i, Py_TYPE(item)->tp_name);
9766 goto onError;
9767 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 if (PyUnicode_READY(item) == -1)
9769 goto onError;
9770 sz += PyUnicode_GET_LENGTH(item);
9771 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009772 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009773 if (i != 0)
9774 sz += seplen;
9775 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9776 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009777 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009778 goto onError;
9779 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009780 if (use_memcpy && last_obj != NULL) {
9781 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9782 use_memcpy = 0;
9783 }
9784 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009785 }
Tim Petersced69f82003-09-16 20:30:58 +00009786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009788 if (res == NULL)
9789 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009790
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009791 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009792#ifdef Py_DEBUG
9793 use_memcpy = 0;
9794#else
9795 if (use_memcpy) {
9796 res_data = PyUnicode_1BYTE_DATA(res);
9797 kind = PyUnicode_KIND(res);
9798 if (seplen != 0)
9799 sep_data = PyUnicode_1BYTE_DATA(sep);
9800 }
9801#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009802 if (use_memcpy) {
9803 for (i = 0; i < seqlen; ++i) {
9804 Py_ssize_t itemlen;
9805 item = items[i];
9806
9807 /* Copy item, and maybe the separator. */
9808 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009809 Py_MEMCPY(res_data,
9810 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009811 kind * seplen);
9812 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009813 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009814
9815 itemlen = PyUnicode_GET_LENGTH(item);
9816 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009817 Py_MEMCPY(res_data,
9818 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009819 kind * itemlen);
9820 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009821 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009822 }
9823 assert(res_data == PyUnicode_1BYTE_DATA(res)
9824 + kind * PyUnicode_GET_LENGTH(res));
9825 }
9826 else {
9827 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9828 Py_ssize_t itemlen;
9829 item = items[i];
9830
9831 /* Copy item, and maybe the separator. */
9832 if (i && seplen != 0) {
9833 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9834 res_offset += seplen;
9835 }
9836
9837 itemlen = PyUnicode_GET_LENGTH(item);
9838 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009839 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009840 res_offset += itemlen;
9841 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009842 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009843 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009844 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009845
Tim Peters05eba1f2004-08-27 21:32:02 +00009846 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009848 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009850
Benjamin Peterson29060642009-01-31 22:14:21 +00009851 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009852 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009854 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009855 return NULL;
9856}
9857
/* Fill `length` code units of the character buffer `data`, starting at code
   unit index `start`, with `value`.

   `kind` selects the width of one code unit: PyUnicode_1BYTE_KIND,
   PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND.  PyUnicode_WCHAR_KIND is
   rejected by an assertion.  `start` and `length` are counted in code units,
   not in bytes.

   Every argument is parenthesized and evaluated exactly once, so expressions
   (including ones with side effects) may be passed safely.  No bounds
   checking is done: the caller must make sure the range fits in `data` and
   that `value` fits in one code unit of the given kind. */
#define FILL(kind, data, value, start, length) \
    do { \
        const int fill_kind_ = (kind); \
        const Py_ssize_t fill_start_ = (start); \
        const Py_ssize_t fill_len_ = (length); \
        const Py_UCS4 fill_value_ = (value); \
        assert(fill_kind_ != PyUnicode_WCHAR_KIND); \
        switch (fill_kind_) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char *to_ = (unsigned char *)(data) + fill_start_; \
            memset(to_, (unsigned char)fill_value_, fill_len_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 *to_ = (Py_UCS2 *)(data) + fill_start_; \
            Py_ssize_t i_; \
            for (i_ = 0; i_ < fill_len_; ++i_, ++to_) \
                *to_ = (Py_UCS2)fill_value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 *to_ = (Py_UCS4 *)(data) + fill_start_; \
            Py_ssize_t i_; \
            for (i_ = 0; i_ < fill_len_; ++i_, ++to_) \
                *to_ = fill_value_; \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9881
Victor Stinnerd3f08822012-05-29 12:57:52 +02009882void
9883_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9884 Py_UCS4 fill_char)
9885{
9886 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9887 const void *data = PyUnicode_DATA(unicode);
9888 assert(PyUnicode_IS_READY(unicode));
9889 assert(unicode_modifiable(unicode));
9890 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9891 assert(start >= 0);
9892 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9893 FILL(kind, data, fill_char, start, length);
9894}
9895
Victor Stinner3fe55312012-01-04 00:33:50 +01009896Py_ssize_t
9897PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9898 Py_UCS4 fill_char)
9899{
9900 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009901
9902 if (!PyUnicode_Check(unicode)) {
9903 PyErr_BadInternalCall();
9904 return -1;
9905 }
9906 if (PyUnicode_READY(unicode) == -1)
9907 return -1;
9908 if (unicode_check_modifiable(unicode))
9909 return -1;
9910
Victor Stinnerd3f08822012-05-29 12:57:52 +02009911 if (start < 0) {
9912 PyErr_SetString(PyExc_IndexError, "string index out of range");
9913 return -1;
9914 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009915 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9916 PyErr_SetString(PyExc_ValueError,
9917 "fill character is bigger than "
9918 "the string maximum character");
9919 return -1;
9920 }
9921
9922 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9923 length = Py_MIN(maxlen, length);
9924 if (length <= 0)
9925 return 0;
9926
Victor Stinnerd3f08822012-05-29 12:57:52 +02009927 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009928 return length;
9929}
9930
Victor Stinner9310abb2011-10-05 00:59:23 +02009931static PyObject *
9932pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009933 Py_ssize_t left,
9934 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 PyObject *u;
9938 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009939 int kind;
9940 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941
9942 if (left < 0)
9943 left = 0;
9944 if (right < 0)
9945 right = 0;
9946
Victor Stinnerc4b49542011-12-11 22:44:26 +01009947 if (left == 0 && right == 0)
9948 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009950 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9951 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009952 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9953 return NULL;
9954 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009956 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009958 if (!u)
9959 return NULL;
9960
9961 kind = PyUnicode_KIND(u);
9962 data = PyUnicode_DATA(u);
9963 if (left)
9964 FILL(kind, data, fill, 0, left);
9965 if (right)
9966 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009967 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009968 assert(_PyUnicode_CheckConsistency(u, 1));
9969 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009970}
9971
Alexander Belopolsky40018472011-02-26 01:02:56 +00009972PyObject *
9973PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009974{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009975 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009976
9977 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009978 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009979 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009980 if (PyUnicode_READY(string) == -1) {
9981 Py_DECREF(string);
9982 return NULL;
9983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009984
Benjamin Petersonead6b532011-12-20 17:23:42 -06009985 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009987 if (PyUnicode_IS_ASCII(string))
9988 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009989 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009990 PyUnicode_GET_LENGTH(string), keepends);
9991 else
9992 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009993 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009994 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 break;
9996 case PyUnicode_2BYTE_KIND:
9997 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009998 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 PyUnicode_GET_LENGTH(string), keepends);
10000 break;
10001 case PyUnicode_4BYTE_KIND:
10002 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010003 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 PyUnicode_GET_LENGTH(string), keepends);
10005 break;
10006 default:
10007 assert(0);
10008 list = 0;
10009 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010010 Py_DECREF(string);
10011 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012}
10013
Alexander Belopolsky40018472011-02-26 01:02:56 +000010014static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010015split(PyObject *self,
10016 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010017 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010019 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 void *buf1, *buf2;
10021 Py_ssize_t len1, len2;
10022 PyObject* out;
10023
Guido van Rossumd57fd912000-03-10 22:53:23 +000010024 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010025 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 if (PyUnicode_READY(self) == -1)
10028 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010030 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010031 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010033 if (PyUnicode_IS_ASCII(self))
10034 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010035 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010036 PyUnicode_GET_LENGTH(self), maxcount
10037 );
10038 else
10039 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010040 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010041 PyUnicode_GET_LENGTH(self), maxcount
10042 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 case PyUnicode_2BYTE_KIND:
10044 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010045 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010046 PyUnicode_GET_LENGTH(self), maxcount
10047 );
10048 case PyUnicode_4BYTE_KIND:
10049 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010050 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 PyUnicode_GET_LENGTH(self), maxcount
10052 );
10053 default:
10054 assert(0);
10055 return NULL;
10056 }
10057
10058 if (PyUnicode_READY(substring) == -1)
10059 return NULL;
10060
10061 kind1 = PyUnicode_KIND(self);
10062 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 len1 = PyUnicode_GET_LENGTH(self);
10064 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010065 if (kind1 < kind2 || len1 < len2) {
10066 out = PyList_New(1);
10067 if (out == NULL)
10068 return NULL;
10069 Py_INCREF(self);
10070 PyList_SET_ITEM(out, 0, self);
10071 return out;
10072 }
10073 buf1 = PyUnicode_DATA(self);
10074 buf2 = PyUnicode_DATA(substring);
10075 if (kind2 != kind1) {
10076 buf2 = _PyUnicode_AsKind(substring, kind1);
10077 if (!buf2)
10078 return NULL;
10079 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010081 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010082 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010083 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10084 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010085 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010086 else
10087 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010088 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 break;
10090 case PyUnicode_2BYTE_KIND:
10091 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010092 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 break;
10094 case PyUnicode_4BYTE_KIND:
10095 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010096 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 break;
10098 default:
10099 out = NULL;
10100 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010101 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102 PyMem_Free(buf2);
10103 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010104}
10105
Alexander Belopolsky40018472011-02-26 01:02:56 +000010106static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010107rsplit(PyObject *self,
10108 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010109 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010110{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010111 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 void *buf1, *buf2;
10113 Py_ssize_t len1, len2;
10114 PyObject* out;
10115
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010116 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010117 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 if (PyUnicode_READY(self) == -1)
10120 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010122 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010123 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010125 if (PyUnicode_IS_ASCII(self))
10126 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010127 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010128 PyUnicode_GET_LENGTH(self), maxcount
10129 );
10130 else
10131 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010132 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010133 PyUnicode_GET_LENGTH(self), maxcount
10134 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 case PyUnicode_2BYTE_KIND:
10136 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010137 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 PyUnicode_GET_LENGTH(self), maxcount
10139 );
10140 case PyUnicode_4BYTE_KIND:
10141 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010142 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 PyUnicode_GET_LENGTH(self), maxcount
10144 );
10145 default:
10146 assert(0);
10147 return NULL;
10148 }
10149
10150 if (PyUnicode_READY(substring) == -1)
10151 return NULL;
10152
10153 kind1 = PyUnicode_KIND(self);
10154 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 len1 = PyUnicode_GET_LENGTH(self);
10156 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010157 if (kind1 < kind2 || len1 < len2) {
10158 out = PyList_New(1);
10159 if (out == NULL)
10160 return NULL;
10161 Py_INCREF(self);
10162 PyList_SET_ITEM(out, 0, self);
10163 return out;
10164 }
10165 buf1 = PyUnicode_DATA(self);
10166 buf2 = PyUnicode_DATA(substring);
10167 if (kind2 != kind1) {
10168 buf2 = _PyUnicode_AsKind(substring, kind1);
10169 if (!buf2)
10170 return NULL;
10171 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010173 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010175 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10176 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010177 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010178 else
10179 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010180 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 break;
10182 case PyUnicode_2BYTE_KIND:
10183 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010184 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 break;
10186 case PyUnicode_4BYTE_KIND:
10187 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010188 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 break;
10190 default:
10191 out = NULL;
10192 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010193 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 PyMem_Free(buf2);
10195 return out;
10196}
10197
10198static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010199anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10200 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010202 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010204 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10205 return asciilib_find(buf1, len1, buf2, len2, offset);
10206 else
10207 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208 case PyUnicode_2BYTE_KIND:
10209 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10210 case PyUnicode_4BYTE_KIND:
10211 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10212 }
10213 assert(0);
10214 return -1;
10215}
10216
10217static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010218anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10219 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010221 switch (kind) {
10222 case PyUnicode_1BYTE_KIND:
10223 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10224 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10225 else
10226 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10227 case PyUnicode_2BYTE_KIND:
10228 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10229 case PyUnicode_4BYTE_KIND:
10230 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10231 }
10232 assert(0);
10233 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010234}
10235
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010236static void
10237replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10238 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10239{
10240 int kind = PyUnicode_KIND(u);
10241 void *data = PyUnicode_DATA(u);
10242 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10243 if (kind == PyUnicode_1BYTE_KIND) {
10244 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10245 (Py_UCS1 *)data + len,
10246 u1, u2, maxcount);
10247 }
10248 else if (kind == PyUnicode_2BYTE_KIND) {
10249 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10250 (Py_UCS2 *)data + len,
10251 u1, u2, maxcount);
10252 }
10253 else {
10254 assert(kind == PyUnicode_4BYTE_KIND);
10255 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10256 (Py_UCS4 *)data + len,
10257 u1, u2, maxcount);
10258 }
10259}
10260
Alexander Belopolsky40018472011-02-26 01:02:56 +000010261static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262replace(PyObject *self, PyObject *str1,
10263 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 PyObject *u;
10266 char *sbuf = PyUnicode_DATA(self);
10267 char *buf1 = PyUnicode_DATA(str1);
10268 char *buf2 = PyUnicode_DATA(str2);
10269 int srelease = 0, release1 = 0, release2 = 0;
10270 int skind = PyUnicode_KIND(self);
10271 int kind1 = PyUnicode_KIND(str1);
10272 int kind2 = PyUnicode_KIND(str2);
10273 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10274 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10275 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010276 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010277 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010278
10279 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010280 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010282 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010283
Victor Stinner59de0ee2011-10-07 10:01:28 +020010284 if (str1 == str2)
10285 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286
Victor Stinner49a0a212011-10-12 23:46:10 +020010287 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010288 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10289 if (maxchar < maxchar_str1)
10290 /* substring too wide to be present */
10291 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010292 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10293 /* Replacing str1 with str2 may cause a maxchar reduction in the
10294 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010295 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010296 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010299 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010301 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010303 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010304 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010305 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010306
Victor Stinner69ed0f42013-04-09 21:48:24 +020010307 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010308 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010309 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010310 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010311 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010313 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010315
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010316 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10317 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010318 }
10319 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 int rkind = skind;
10321 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010322 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 if (kind1 < rkind) {
10325 /* widen substring */
10326 buf1 = _PyUnicode_AsKind(str1, rkind);
10327 if (!buf1) goto error;
10328 release1 = 1;
10329 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010330 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010331 if (i < 0)
10332 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 if (rkind > kind2) {
10334 /* widen replacement */
10335 buf2 = _PyUnicode_AsKind(str2, rkind);
10336 if (!buf2) goto error;
10337 release2 = 1;
10338 }
10339 else if (rkind < kind2) {
10340 /* widen self and buf1 */
10341 rkind = kind2;
10342 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010343 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 sbuf = _PyUnicode_AsKind(self, rkind);
10345 if (!sbuf) goto error;
10346 srelease = 1;
10347 buf1 = _PyUnicode_AsKind(str1, rkind);
10348 if (!buf1) goto error;
10349 release1 = 1;
10350 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010351 u = PyUnicode_New(slen, maxchar);
10352 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010354 assert(PyUnicode_KIND(u) == rkind);
10355 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010356
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010357 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010358 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010359 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010361 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010363
10364 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010365 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010366 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010367 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010368 if (i == -1)
10369 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010370 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010372 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010374 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010376 }
10377 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010379 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 int rkind = skind;
10381 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010384 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 buf1 = _PyUnicode_AsKind(str1, rkind);
10386 if (!buf1) goto error;
10387 release1 = 1;
10388 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010389 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010390 if (n == 0)
10391 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010393 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 buf2 = _PyUnicode_AsKind(str2, rkind);
10395 if (!buf2) goto error;
10396 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010399 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 rkind = kind2;
10401 sbuf = _PyUnicode_AsKind(self, rkind);
10402 if (!sbuf) goto error;
10403 srelease = 1;
10404 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010405 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406 buf1 = _PyUnicode_AsKind(str1, rkind);
10407 if (!buf1) goto error;
10408 release1 = 1;
10409 }
10410 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10411 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010412 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 PyErr_SetString(PyExc_OverflowError,
10414 "replace string is too long");
10415 goto error;
10416 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010417 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010418 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010419 _Py_INCREF_UNICODE_EMPTY();
10420 if (!unicode_empty)
10421 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010422 u = unicode_empty;
10423 goto done;
10424 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010425 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 PyErr_SetString(PyExc_OverflowError,
10427 "replace string is too long");
10428 goto error;
10429 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010430 u = PyUnicode_New(new_size, maxchar);
10431 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010433 assert(PyUnicode_KIND(u) == rkind);
10434 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 ires = i = 0;
10436 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010437 while (n-- > 0) {
10438 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010439 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010440 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010441 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010442 if (j == -1)
10443 break;
10444 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010445 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010446 memcpy(res + rkind * ires,
10447 sbuf + rkind * i,
10448 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010450 }
10451 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010453 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010455 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010457 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010461 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010462 memcpy(res + rkind * ires,
10463 sbuf + rkind * i,
10464 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010465 }
10466 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010467 /* interleave */
10468 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010469 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010471 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010473 if (--n <= 0)
10474 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010475 memcpy(res + rkind * ires,
10476 sbuf + rkind * i,
10477 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 ires++;
10479 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010480 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010481 memcpy(res + rkind * ires,
10482 sbuf + rkind * i,
10483 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010484 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010485 }
10486
10487 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010488 unicode_adjust_maxchar(&u);
10489 if (u == NULL)
10490 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010492
10493 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 if (srelease)
10495 PyMem_FREE(sbuf);
10496 if (release1)
10497 PyMem_FREE(buf1);
10498 if (release2)
10499 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010500 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010502
Benjamin Peterson29060642009-01-31 22:14:21 +000010503 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010504 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 if (srelease)
10506 PyMem_FREE(sbuf);
10507 if (release1)
10508 PyMem_FREE(buf1);
10509 if (release2)
10510 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010511 return unicode_result_unchanged(self);
10512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 error:
10514 if (srelease && sbuf)
10515 PyMem_FREE(sbuf);
10516 if (release1 && buf1)
10517 PyMem_FREE(buf1);
10518 if (release2 && buf2)
10519 PyMem_FREE(buf2);
10520 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521}
10522
10523/* --- Unicode Object Methods --------------------------------------------- */
10524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010525PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010526 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527\n\
10528Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010529characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530
10531static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010532unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010534 if (PyUnicode_READY(self) == -1)
10535 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010536 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010537}
10538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010539PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010540 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541\n\
10542Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010543have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010544
10545static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010546unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010548 if (PyUnicode_READY(self) == -1)
10549 return NULL;
10550 if (PyUnicode_GET_LENGTH(self) == 0)
10551 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010552 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010553}
10554
Benjamin Petersond5890c82012-01-14 13:23:30 -050010555PyDoc_STRVAR(casefold__doc__,
10556 "S.casefold() -> str\n\
10557\n\
10558Return a version of S suitable for caseless comparisons.");
10559
10560static PyObject *
10561unicode_casefold(PyObject *self)
10562{
10563 if (PyUnicode_READY(self) == -1)
10564 return NULL;
10565 if (PyUnicode_IS_ASCII(self))
10566 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010567 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010568}
10569
10570
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010571/* Argument converter. Coerces to a single unicode character */
10572
10573static int
10574convert_uc(PyObject *obj, void *addr)
10575{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010577 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010578
Benjamin Peterson14339b62009-01-31 16:36:08 +000010579 uniobj = PyUnicode_FromObject(obj);
10580 if (uniobj == NULL) {
10581 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010582 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010583 return 0;
10584 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010586 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010587 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010588 Py_DECREF(uniobj);
10589 return 0;
10590 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010592 Py_DECREF(uniobj);
10593 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010594}
10595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010596PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010597 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010599Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010600done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601
10602static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010603unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010605 Py_ssize_t marg, left;
10606 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 Py_UCS4 fillchar = ' ';
10608
Victor Stinnere9a29352011-10-01 02:14:59 +020010609 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611
Benjamin Petersonbac79492012-01-14 13:34:47 -050010612 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613 return NULL;
10614
Victor Stinnerc4b49542011-12-11 22:44:26 +010010615 if (PyUnicode_GET_LENGTH(self) >= width)
10616 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617
Victor Stinnerc4b49542011-12-11 22:44:26 +010010618 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619 left = marg / 2 + (marg & width & 1);
10620
Victor Stinner9310abb2011-10-05 00:59:23 +020010621 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010622}
10623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624/* This function assumes that str1 and str2 are readied by the caller. */
10625
Marc-André Lemburge5034372000-08-08 08:04:29 +000010626static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010627unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010628{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010629#define COMPARE(TYPE1, TYPE2) \
10630 do { \
10631 TYPE1* p1 = (TYPE1 *)data1; \
10632 TYPE2* p2 = (TYPE2 *)data2; \
10633 TYPE1* end = p1 + len; \
10634 Py_UCS4 c1, c2; \
10635 for (; p1 != end; p1++, p2++) { \
10636 c1 = *p1; \
10637 c2 = *p2; \
10638 if (c1 != c2) \
10639 return (c1 < c2) ? -1 : 1; \
10640 } \
10641 } \
10642 while (0)
10643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 int kind1, kind2;
10645 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010646 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 kind1 = PyUnicode_KIND(str1);
10649 kind2 = PyUnicode_KIND(str2);
10650 data1 = PyUnicode_DATA(str1);
10651 data2 = PyUnicode_DATA(str2);
10652 len1 = PyUnicode_GET_LENGTH(str1);
10653 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010654 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010655
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010656 switch(kind1) {
10657 case PyUnicode_1BYTE_KIND:
10658 {
10659 switch(kind2) {
10660 case PyUnicode_1BYTE_KIND:
10661 {
10662 int cmp = memcmp(data1, data2, len);
10663 /* normalize result of memcmp() into the range [-1; 1] */
10664 if (cmp < 0)
10665 return -1;
10666 if (cmp > 0)
10667 return 1;
10668 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010669 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010670 case PyUnicode_2BYTE_KIND:
10671 COMPARE(Py_UCS1, Py_UCS2);
10672 break;
10673 case PyUnicode_4BYTE_KIND:
10674 COMPARE(Py_UCS1, Py_UCS4);
10675 break;
10676 default:
10677 assert(0);
10678 }
10679 break;
10680 }
10681 case PyUnicode_2BYTE_KIND:
10682 {
10683 switch(kind2) {
10684 case PyUnicode_1BYTE_KIND:
10685 COMPARE(Py_UCS2, Py_UCS1);
10686 break;
10687 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010688 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010689 COMPARE(Py_UCS2, Py_UCS2);
10690 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010691 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010692 case PyUnicode_4BYTE_KIND:
10693 COMPARE(Py_UCS2, Py_UCS4);
10694 break;
10695 default:
10696 assert(0);
10697 }
10698 break;
10699 }
10700 case PyUnicode_4BYTE_KIND:
10701 {
10702 switch(kind2) {
10703 case PyUnicode_1BYTE_KIND:
10704 COMPARE(Py_UCS4, Py_UCS1);
10705 break;
10706 case PyUnicode_2BYTE_KIND:
10707 COMPARE(Py_UCS4, Py_UCS2);
10708 break;
10709 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010710 {
10711#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10712 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10713 /* normalize result of wmemcmp() into the range [-1; 1] */
10714 if (cmp < 0)
10715 return -1;
10716 if (cmp > 0)
10717 return 1;
10718#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010719 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010720#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010721 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010722 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010723 default:
10724 assert(0);
10725 }
10726 break;
10727 }
10728 default:
10729 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010730 }
10731
Victor Stinner770e19e2012-10-04 22:59:45 +020010732 if (len1 == len2)
10733 return 0;
10734 if (len1 < len2)
10735 return -1;
10736 else
10737 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010738
10739#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010740}
10741
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010742Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010743unicode_compare_eq(PyObject *str1, PyObject *str2)
10744{
10745 int kind;
10746 void *data1, *data2;
10747 Py_ssize_t len;
10748 int cmp;
10749
Victor Stinnere5567ad2012-10-23 02:48:49 +020010750 len = PyUnicode_GET_LENGTH(str1);
10751 if (PyUnicode_GET_LENGTH(str2) != len)
10752 return 0;
10753 kind = PyUnicode_KIND(str1);
10754 if (PyUnicode_KIND(str2) != kind)
10755 return 0;
10756 data1 = PyUnicode_DATA(str1);
10757 data2 = PyUnicode_DATA(str2);
10758
10759 cmp = memcmp(data1, data2, len * kind);
10760 return (cmp == 0);
10761}
10762
10763
Alexander Belopolsky40018472011-02-26 01:02:56 +000010764int
10765PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010767 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10768 if (PyUnicode_READY(left) == -1 ||
10769 PyUnicode_READY(right) == -1)
10770 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010771
10772 /* a string is equal to itself */
10773 if (left == right)
10774 return 0;
10775
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010776 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010778 PyErr_Format(PyExc_TypeError,
10779 "Can't compare %.100s and %.100s",
10780 left->ob_type->tp_name,
10781 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782 return -1;
10783}
10784
Martin v. Löwis5b222132007-06-10 09:51:05 +000010785int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010786_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10787{
10788 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10789 if (right_str == NULL)
10790 return -1;
10791 return PyUnicode_Compare(left, right_str);
10792}
10793
10794int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010795PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 Py_ssize_t i;
10798 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 Py_UCS4 chr;
10800
Victor Stinner910337b2011-10-03 03:20:16 +020010801 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 if (PyUnicode_READY(uni) == -1)
10803 return -1;
10804 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010805 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010806 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010807 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010808 size_t len, len2 = strlen(str);
10809 int cmp;
10810
10811 len = Py_MIN(len1, len2);
10812 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010813 if (cmp != 0) {
10814 if (cmp < 0)
10815 return -1;
10816 else
10817 return 1;
10818 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010819 if (len1 > len2)
10820 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010821 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010822 return -1; /* str is longer */
10823 return 0;
10824 }
10825 else {
10826 void *data = PyUnicode_DATA(uni);
10827 /* Compare Unicode string and source character set string */
10828 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010829 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010830 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10831 /* This check keeps Python strings that end in '\0' from comparing equal
10832 to C strings identical up to that point. */
10833 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10834 return 1; /* uni is longer */
10835 if (str[i])
10836 return -1; /* str is longer */
10837 return 0;
10838 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010839}
10840
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010841
Benjamin Peterson29060642009-01-31 22:14:21 +000010842#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010843 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010844
Alexander Belopolsky40018472011-02-26 01:02:56 +000010845PyObject *
10846PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010847{
10848 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010849 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010850
Victor Stinnere5567ad2012-10-23 02:48:49 +020010851 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10852 Py_RETURN_NOTIMPLEMENTED;
10853
10854 if (PyUnicode_READY(left) == -1 ||
10855 PyUnicode_READY(right) == -1)
10856 return NULL;
10857
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010858 if (left == right) {
10859 switch (op) {
10860 case Py_EQ:
10861 case Py_LE:
10862 case Py_GE:
10863 /* a string is equal to itself */
10864 v = Py_True;
10865 break;
10866 case Py_NE:
10867 case Py_LT:
10868 case Py_GT:
10869 v = Py_False;
10870 break;
10871 default:
10872 PyErr_BadArgument();
10873 return NULL;
10874 }
10875 }
10876 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010877 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010878 result ^= (op == Py_NE);
10879 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010880 }
10881 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010882 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010883
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010884 /* Convert the return value to a Boolean */
10885 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010886 case Py_LE:
10887 v = TEST_COND(result <= 0);
10888 break;
10889 case Py_GE:
10890 v = TEST_COND(result >= 0);
10891 break;
10892 case Py_LT:
10893 v = TEST_COND(result == -1);
10894 break;
10895 case Py_GT:
10896 v = TEST_COND(result == 1);
10897 break;
10898 default:
10899 PyErr_BadArgument();
10900 return NULL;
10901 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010902 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010903 Py_INCREF(v);
10904 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010905}
10906
Alexander Belopolsky40018472011-02-26 01:02:56 +000010907int
10908PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010909{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010910 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010911 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010912 void *buf1, *buf2;
10913 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010914 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010915
10916 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010917 sub = PyUnicode_FromObject(element);
10918 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010919 PyErr_Format(PyExc_TypeError,
10920 "'in <string>' requires string as left operand, not %s",
10921 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010922 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010923 }
10924
Thomas Wouters477c8d52006-05-27 19:21:47 +000010925 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010926 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010927 Py_DECREF(sub);
10928 return -1;
10929 }
10930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931 kind1 = PyUnicode_KIND(str);
10932 kind2 = PyUnicode_KIND(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010933 if (kind1 < kind2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010934 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010935 Py_DECREF(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010936 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 }
10938 len1 = PyUnicode_GET_LENGTH(str);
10939 len2 = PyUnicode_GET_LENGTH(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010940 if (len1 < len2) {
10941 Py_DECREF(sub);
10942 Py_DECREF(str);
10943 return 0;
10944 }
10945 buf1 = PyUnicode_DATA(str);
10946 buf2 = PyUnicode_DATA(sub);
10947 if (len2 == 1) {
10948 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
10949 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
10950 Py_DECREF(sub);
10951 Py_DECREF(str);
10952 return result;
10953 }
10954 if (kind2 != kind1) {
10955 buf2 = _PyUnicode_AsKind(sub, kind1);
10956 if (!buf2) {
10957 Py_DECREF(sub);
10958 Py_DECREF(str);
10959 return -1;
10960 }
10961 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962
Victor Stinner77282cb2013-04-14 19:22:47 +020010963 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010964 case PyUnicode_1BYTE_KIND:
10965 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10966 break;
10967 case PyUnicode_2BYTE_KIND:
10968 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10969 break;
10970 case PyUnicode_4BYTE_KIND:
10971 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10972 break;
10973 default:
10974 result = -1;
10975 assert(0);
10976 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010977
10978 Py_DECREF(str);
10979 Py_DECREF(sub);
10980
Victor Stinner77282cb2013-04-14 19:22:47 +020010981 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 PyMem_Free(buf2);
10983
Guido van Rossum403d68b2000-03-13 15:55:09 +000010984 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010985}
10986
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987/* Concat to string or Unicode object giving a new Unicode object. */
10988
Alexander Belopolsky40018472011-02-26 01:02:56 +000010989PyObject *
10990PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010993 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010994 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995
10996 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010997 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010999 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011000 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011002 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003
11004 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011005 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011006 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011007 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011009 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011010 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011011 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012 }
11013
Victor Stinner488fa492011-12-12 00:01:39 +010011014 u_len = PyUnicode_GET_LENGTH(u);
11015 v_len = PyUnicode_GET_LENGTH(v);
11016 if (u_len > PY_SSIZE_T_MAX - v_len) {
11017 PyErr_SetString(PyExc_OverflowError,
11018 "strings are too large to concat");
11019 goto onError;
11020 }
11021 new_len = u_len + v_len;
11022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011024 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011025 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011028 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011030 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011031 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11032 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033 Py_DECREF(u);
11034 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011035 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037
Benjamin Peterson29060642009-01-31 22:14:21 +000011038 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039 Py_XDECREF(u);
11040 Py_XDECREF(v);
11041 return NULL;
11042}
11043
Walter Dörwald1ab83302007-05-18 17:15:44 +000011044void
Victor Stinner23e56682011-10-03 03:54:37 +020011045PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011046{
Victor Stinner23e56682011-10-03 03:54:37 +020011047 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011048 Py_UCS4 maxchar, maxchar2;
11049 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011050
11051 if (p_left == NULL) {
11052 if (!PyErr_Occurred())
11053 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011054 return;
11055 }
Victor Stinner23e56682011-10-03 03:54:37 +020011056 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011057 if (right == NULL || left == NULL
11058 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011059 if (!PyErr_Occurred())
11060 PyErr_BadInternalCall();
11061 goto error;
11062 }
11063
Benjamin Petersonbac79492012-01-14 13:34:47 -050011064 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011065 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011066 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011067 goto error;
11068
Victor Stinner488fa492011-12-12 00:01:39 +010011069 /* Shortcuts */
11070 if (left == unicode_empty) {
11071 Py_DECREF(left);
11072 Py_INCREF(right);
11073 *p_left = right;
11074 return;
11075 }
11076 if (right == unicode_empty)
11077 return;
11078
11079 left_len = PyUnicode_GET_LENGTH(left);
11080 right_len = PyUnicode_GET_LENGTH(right);
11081 if (left_len > PY_SSIZE_T_MAX - right_len) {
11082 PyErr_SetString(PyExc_OverflowError,
11083 "strings are too large to concat");
11084 goto error;
11085 }
11086 new_len = left_len + right_len;
11087
11088 if (unicode_modifiable(left)
11089 && PyUnicode_CheckExact(right)
11090 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011091 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11092 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011093 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011094 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011095 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11096 {
11097 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011098 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011099 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011100
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011101 /* copy 'right' into the newly allocated area of 'left' */
11102 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011103 }
Victor Stinner488fa492011-12-12 00:01:39 +010011104 else {
11105 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11106 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011107 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011108
Victor Stinner488fa492011-12-12 00:01:39 +010011109 /* Concat the two Unicode strings */
11110 res = PyUnicode_New(new_len, maxchar);
11111 if (res == NULL)
11112 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011113 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11114 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011115 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011116 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011117 }
11118 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011119 return;
11120
11121error:
Victor Stinner488fa492011-12-12 00:01:39 +010011122 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011123}
11124
11125void
11126PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11127{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011128 PyUnicode_Append(pleft, right);
11129 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011130}
11131
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011132PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011133 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011134\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011135Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011136string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011137interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138
11139static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011140unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011142 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011143 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011144 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011146 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147 void *buf1, *buf2;
11148 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149
Jesus Ceaac451502011-04-20 17:09:23 +020011150 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11151 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011152 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011154 kind1 = PyUnicode_KIND(self);
11155 kind2 = PyUnicode_KIND(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011156 if (kind1 < kind2) {
Christian Heimesd47802e2013-06-29 21:33:36 +020011157 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011158 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011159 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160 len1 = PyUnicode_GET_LENGTH(self);
11161 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011163 if (end - start < len2) {
11164 Py_DECREF(substring);
11165 return PyLong_FromLong(0);
11166 }
11167 buf1 = PyUnicode_DATA(self);
11168 buf2 = PyUnicode_DATA(substring);
11169 if (kind2 != kind1) {
11170 buf2 = _PyUnicode_AsKind(substring, kind1);
11171 if (!buf2) {
11172 Py_DECREF(substring);
11173 return NULL;
11174 }
11175 }
11176 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 case PyUnicode_1BYTE_KIND:
11178 iresult = ucs1lib_count(
11179 ((Py_UCS1*)buf1) + start, end - start,
11180 buf2, len2, PY_SSIZE_T_MAX
11181 );
11182 break;
11183 case PyUnicode_2BYTE_KIND:
11184 iresult = ucs2lib_count(
11185 ((Py_UCS2*)buf1) + start, end - start,
11186 buf2, len2, PY_SSIZE_T_MAX
11187 );
11188 break;
11189 case PyUnicode_4BYTE_KIND:
11190 iresult = ucs4lib_count(
11191 ((Py_UCS4*)buf1) + start, end - start,
11192 buf2, len2, PY_SSIZE_T_MAX
11193 );
11194 break;
11195 default:
11196 assert(0); iresult = 0;
11197 }
11198
11199 result = PyLong_FromSsize_t(iresult);
11200
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011201 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011202 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203
11204 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011205
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206 return result;
11207}
11208
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011209PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011210 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011212Encode S using the codec registered for encoding. Default encoding\n\
11213is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011214handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011215a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11216'xmlcharrefreplace' as well as any other name registered with\n\
11217codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218
11219static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011220unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011222 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223 char *encoding = NULL;
11224 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011225
Benjamin Peterson308d6372009-09-18 21:42:35 +000011226 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11227 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011229 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011230}
11231
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011232PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011233 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234\n\
11235Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011236If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237
11238static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011239unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011241 Py_ssize_t i, j, line_pos, src_len, incr;
11242 Py_UCS4 ch;
11243 PyObject *u;
11244 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011245 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011247 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011248 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249
Ezio Melotti745d54d2013-11-16 19:10:57 +020011250 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11251 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253
Antoine Pitrou22425222011-10-04 19:10:51 +020011254 if (PyUnicode_READY(self) == -1)
11255 return NULL;
11256
Thomas Wouters7e474022000-07-16 12:04:32 +000011257 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011258 src_len = PyUnicode_GET_LENGTH(self);
11259 i = j = line_pos = 0;
11260 kind = PyUnicode_KIND(self);
11261 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011262 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011263 for (; i < src_len; i++) {
11264 ch = PyUnicode_READ(kind, src_data, i);
11265 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011266 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011267 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011268 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011270 goto overflow;
11271 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011272 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011273 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011274 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011276 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011277 goto overflow;
11278 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011280 if (ch == '\n' || ch == '\r')
11281 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011283 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011284 if (!found)
11285 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011286
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011288 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289 if (!u)
11290 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011291 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292
Antoine Pitroue71d5742011-10-04 15:55:09 +020011293 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294
Antoine Pitroue71d5742011-10-04 15:55:09 +020011295 for (; i < src_len; i++) {
11296 ch = PyUnicode_READ(kind, src_data, i);
11297 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011298 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011299 incr = tabsize - (line_pos % tabsize);
11300 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011301 FILL(kind, dest_data, ' ', j, incr);
11302 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011303 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011304 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011305 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011306 line_pos++;
11307 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011308 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011309 if (ch == '\n' || ch == '\r')
11310 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011312 }
11313 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011314 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011315
Antoine Pitroue71d5742011-10-04 15:55:09 +020011316 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011317 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11318 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319}
11320
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011321PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011322 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323\n\
11324Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011325such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326arguments start and end are interpreted as in slice notation.\n\
11327\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011328Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329
11330static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011333 /* initialize variables to prevent gcc warning */
11334 PyObject *substring = NULL;
11335 Py_ssize_t start = 0;
11336 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011337 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338
Jesus Ceaac451502011-04-20 17:09:23 +020011339 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11340 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342
Christian Heimesd47802e2013-06-29 21:33:36 +020011343 if (PyUnicode_READY(self) == -1) {
11344 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011346 }
11347 if (PyUnicode_READY(substring) == -1) {
11348 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011350 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011351
Victor Stinner7931d9a2011-11-04 00:22:48 +010011352 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353
11354 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 if (result == -2)
11357 return NULL;
11358
Christian Heimes217cfd12007-12-02 14:31:20 +000011359 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360}
11361
11362static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011363unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011365 void *data;
11366 enum PyUnicode_Kind kind;
11367 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011368
11369 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11370 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011371 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011372 }
11373 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11374 PyErr_SetString(PyExc_IndexError, "string index out of range");
11375 return NULL;
11376 }
11377 kind = PyUnicode_KIND(self);
11378 data = PyUnicode_DATA(self);
11379 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011380 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381}
11382
Guido van Rossumc2504932007-09-18 19:42:40 +000011383/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011384 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011385static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011386unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011387{
Guido van Rossumc2504932007-09-18 19:42:40 +000011388 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011389 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011390
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011391#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011392 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011393#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 if (_PyUnicode_HASH(self) != -1)
11395 return _PyUnicode_HASH(self);
11396 if (PyUnicode_READY(self) == -1)
11397 return -1;
11398 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011399 /*
11400 We make the hash of the empty string be 0, rather than using
11401 (prefix ^ suffix), since this slightly obfuscates the hash secret
11402 */
11403 if (len == 0) {
11404 _PyUnicode_HASH(self) = 0;
11405 return 0;
11406 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011407 x = _Py_HashBytes(PyUnicode_DATA(self),
11408 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011410 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411}
11412
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011413PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011414 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011416Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417
11418static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011419unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011421 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011422 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011423 PyObject *substring = NULL;
11424 Py_ssize_t start = 0;
11425 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426
Jesus Ceaac451502011-04-20 17:09:23 +020011427 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11428 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
Christian Heimesd47a0452013-06-29 21:21:37 +020011431 if (PyUnicode_READY(self) == -1) {
11432 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011434 }
11435 if (PyUnicode_READY(substring) == -1) {
11436 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011437 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011438 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439
Victor Stinner7931d9a2011-11-04 00:22:48 +010011440 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441
11442 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 if (result == -2)
11445 return NULL;
11446
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447 if (result < 0) {
11448 PyErr_SetString(PyExc_ValueError, "substring not found");
11449 return NULL;
11450 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011451
Christian Heimes217cfd12007-12-02 14:31:20 +000011452 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453}
11454
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011455PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011456 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011458Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011459at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460
11461static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011462unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 Py_ssize_t i, length;
11465 int kind;
11466 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467 int cased;
11468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 if (PyUnicode_READY(self) == -1)
11470 return NULL;
11471 length = PyUnicode_GET_LENGTH(self);
11472 kind = PyUnicode_KIND(self);
11473 data = PyUnicode_DATA(self);
11474
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476 if (length == 1)
11477 return PyBool_FromLong(
11478 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011480 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011482 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011483
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011485 for (i = 0; i < length; i++) {
11486 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011487
Benjamin Peterson29060642009-01-31 22:14:21 +000011488 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11489 return PyBool_FromLong(0);
11490 else if (!cased && Py_UNICODE_ISLOWER(ch))
11491 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011493 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494}
11495
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011496PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011497 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011499Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011500at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501
11502static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011503unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011505 Py_ssize_t i, length;
11506 int kind;
11507 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508 int cased;
11509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011510 if (PyUnicode_READY(self) == -1)
11511 return NULL;
11512 length = PyUnicode_GET_LENGTH(self);
11513 kind = PyUnicode_KIND(self);
11514 data = PyUnicode_DATA(self);
11515
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 if (length == 1)
11518 return PyBool_FromLong(
11519 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011521 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011522 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011523 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011524
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526 for (i = 0; i < length; i++) {
11527 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011528
Benjamin Peterson29060642009-01-31 22:14:21 +000011529 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11530 return PyBool_FromLong(0);
11531 else if (!cased && Py_UNICODE_ISUPPER(ch))
11532 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011534 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535}
11536
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011537PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011538 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011540Return True if S is a titlecased string and there is at least one\n\
11541character in S, i.e. upper- and titlecase characters may only\n\
11542follow uncased characters and lowercase characters only cased ones.\n\
11543Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544
11545static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011546unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548 Py_ssize_t i, length;
11549 int kind;
11550 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551 int cased, previous_is_cased;
11552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553 if (PyUnicode_READY(self) == -1)
11554 return NULL;
11555 length = PyUnicode_GET_LENGTH(self);
11556 kind = PyUnicode_KIND(self);
11557 data = PyUnicode_DATA(self);
11558
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 if (length == 1) {
11561 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11562 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11563 (Py_UNICODE_ISUPPER(ch) != 0));
11564 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011566 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011567 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011568 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011569
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570 cased = 0;
11571 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 for (i = 0; i < length; i++) {
11573 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011574
Benjamin Peterson29060642009-01-31 22:14:21 +000011575 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11576 if (previous_is_cased)
11577 return PyBool_FromLong(0);
11578 previous_is_cased = 1;
11579 cased = 1;
11580 }
11581 else if (Py_UNICODE_ISLOWER(ch)) {
11582 if (!previous_is_cased)
11583 return PyBool_FromLong(0);
11584 previous_is_cased = 1;
11585 cased = 1;
11586 }
11587 else
11588 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011590 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591}
11592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011593PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011594 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011596Return True if all characters in S are whitespace\n\
11597and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598
11599static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011600unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011602 Py_ssize_t i, length;
11603 int kind;
11604 void *data;
11605
11606 if (PyUnicode_READY(self) == -1)
11607 return NULL;
11608 length = PyUnicode_GET_LENGTH(self);
11609 kind = PyUnicode_KIND(self);
11610 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 if (length == 1)
11614 return PyBool_FromLong(
11615 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011617 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011618 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011619 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 for (i = 0; i < length; i++) {
11622 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011623 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011626 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627}
11628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011629PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011630 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011631\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011632Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011633and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011634
11635static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011636unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011637{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 Py_ssize_t i, length;
11639 int kind;
11640 void *data;
11641
11642 if (PyUnicode_READY(self) == -1)
11643 return NULL;
11644 length = PyUnicode_GET_LENGTH(self);
11645 kind = PyUnicode_KIND(self);
11646 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011647
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011648 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649 if (length == 1)
11650 return PyBool_FromLong(
11651 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011652
11653 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011655 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 for (i = 0; i < length; i++) {
11658 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011659 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011660 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011661 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011662}
11663
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011664PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011665 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011666\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011667Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011668and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011669
11670static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011671unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011672{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673 int kind;
11674 void *data;
11675 Py_ssize_t len, i;
11676
11677 if (PyUnicode_READY(self) == -1)
11678 return NULL;
11679
11680 kind = PyUnicode_KIND(self);
11681 data = PyUnicode_DATA(self);
11682 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011683
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011684 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 if (len == 1) {
11686 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11687 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11688 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011689
11690 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011691 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011692 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 for (i = 0; i < len; i++) {
11695 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011696 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011697 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011698 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011699 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011700}
11701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011702PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011703 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011705Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011706False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707
11708static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011709unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 Py_ssize_t i, length;
11712 int kind;
11713 void *data;
11714
11715 if (PyUnicode_READY(self) == -1)
11716 return NULL;
11717 length = PyUnicode_GET_LENGTH(self);
11718 kind = PyUnicode_KIND(self);
11719 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011722 if (length == 1)
11723 return PyBool_FromLong(
11724 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011726 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011727 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011728 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011730 for (i = 0; i < length; i++) {
11731 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011732 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011734 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735}
11736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011737PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011738 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011740Return True if all characters in S are digits\n\
11741and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742
11743static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011744unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 Py_ssize_t i, length;
11747 int kind;
11748 void *data;
11749
11750 if (PyUnicode_READY(self) == -1)
11751 return NULL;
11752 length = PyUnicode_GET_LENGTH(self);
11753 kind = PyUnicode_KIND(self);
11754 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755
Guido van Rossumd57fd912000-03-10 22:53:23 +000011756 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 if (length == 1) {
11758 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11759 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11760 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011762 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011764 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 for (i = 0; i < length; i++) {
11767 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011768 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011770 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771}
11772
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011773PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011774 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011776Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011777False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778
11779static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011780unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 Py_ssize_t i, length;
11783 int kind;
11784 void *data;
11785
11786 if (PyUnicode_READY(self) == -1)
11787 return NULL;
11788 length = PyUnicode_GET_LENGTH(self);
11789 kind = PyUnicode_KIND(self);
11790 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 if (length == 1)
11794 return PyBool_FromLong(
11795 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011797 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011798 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011799 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 for (i = 0; i < length; i++) {
11802 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011803 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011805 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806}
11807
Martin v. Löwis47383402007-08-15 07:32:56 +000011808int
11809PyUnicode_IsIdentifier(PyObject *self)
11810{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011811 int kind;
11812 void *data;
11813 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011814 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 if (PyUnicode_READY(self) == -1) {
11817 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011818 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 }
11820
11821 /* Special case for empty strings */
11822 if (PyUnicode_GET_LENGTH(self) == 0)
11823 return 0;
11824 kind = PyUnicode_KIND(self);
11825 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011826
11827 /* PEP 3131 says that the first character must be in
11828 XID_Start and subsequent characters in XID_Continue,
11829 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011830 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011831 letters, digits, underscore). However, given the current
11832 definition of XID_Start and XID_Continue, it is sufficient
11833 to check just for these, except that _ must be allowed
11834 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011836 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011837 return 0;
11838
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011839 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011841 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011842 return 1;
11843}
11844
11845PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011846 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011847\n\
11848Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011849to the language definition.\n\
11850\n\
11851Use keyword.iskeyword() to test for reserved identifiers\n\
11852such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011853
11854static PyObject*
11855unicode_isidentifier(PyObject *self)
11856{
11857 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11858}
11859
Georg Brandl559e5d72008-06-11 18:37:52 +000011860PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011861 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011862\n\
11863Return True if all characters in S are considered\n\
11864printable in repr() or S is empty, False otherwise.");
11865
11866static PyObject*
11867unicode_isprintable(PyObject *self)
11868{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011869 Py_ssize_t i, length;
11870 int kind;
11871 void *data;
11872
11873 if (PyUnicode_READY(self) == -1)
11874 return NULL;
11875 length = PyUnicode_GET_LENGTH(self);
11876 kind = PyUnicode_KIND(self);
11877 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011878
11879 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011880 if (length == 1)
11881 return PyBool_FromLong(
11882 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011884 for (i = 0; i < length; i++) {
11885 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011886 Py_RETURN_FALSE;
11887 }
11888 }
11889 Py_RETURN_TRUE;
11890}
11891
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011892PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011893 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894\n\
11895Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011896iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897
11898static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011899unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011901 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902}
11903
Martin v. Löwis18e16552006-02-15 17:27:45 +000011904static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011905unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011907 if (PyUnicode_READY(self) == -1)
11908 return -1;
11909 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910}
11911
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011912PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011913 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011915Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011916done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917
11918static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011919unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011921 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 Py_UCS4 fillchar = ' ';
11923
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011924 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925 return NULL;
11926
Benjamin Petersonbac79492012-01-14 13:34:47 -050011927 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011928 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929
Victor Stinnerc4b49542011-12-11 22:44:26 +010011930 if (PyUnicode_GET_LENGTH(self) >= width)
11931 return unicode_result_unchanged(self);
11932
11933 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934}
11935
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011936PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011937 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011939Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940
11941static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011942unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011944 if (PyUnicode_READY(self) == -1)
11945 return NULL;
11946 if (PyUnicode_IS_ASCII(self))
11947 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011948 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949}
11950
/* Strip modes understood by the strip helpers.  The values double as
   indexes into stripformat[] below, so they must stay 0, 1, 2. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip mode above.
   Both the strings and the table itself are read-only. */
static const char *const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for a strip mode: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
11959
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011960/* externally visible for str.strip(unicode) */
11961PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011962_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011963{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 void *data;
11965 int kind;
11966 Py_ssize_t i, j, len;
11967 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011968 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11971 return NULL;
11972
11973 kind = PyUnicode_KIND(self);
11974 data = PyUnicode_DATA(self);
11975 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011976 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011977 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11978 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011979 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011980
Benjamin Peterson14339b62009-01-31 16:36:08 +000011981 i = 0;
11982 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011983 while (i < len) {
11984 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11985 if (!BLOOM(sepmask, ch))
11986 break;
11987 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11988 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 i++;
11990 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011991 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011992
Benjamin Peterson14339b62009-01-31 16:36:08 +000011993 j = len;
11994 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011995 j--;
11996 while (j >= i) {
11997 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11998 if (!BLOOM(sepmask, ch))
11999 break;
12000 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12001 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012002 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012003 }
12004
Benjamin Peterson29060642009-01-31 22:14:21 +000012005 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012006 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012007
Victor Stinner7931d9a2011-11-04 00:22:48 +010012008 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009}
12010
12011PyObject*
12012PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12013{
12014 unsigned char *data;
12015 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012016 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017
Victor Stinnerde636f32011-10-01 03:55:54 +020012018 if (PyUnicode_READY(self) == -1)
12019 return NULL;
12020
Victor Stinner684d5fd2012-05-03 02:32:34 +020012021 length = PyUnicode_GET_LENGTH(self);
12022 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012023
Victor Stinner684d5fd2012-05-03 02:32:34 +020012024 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012025 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026
Victor Stinnerde636f32011-10-01 03:55:54 +020012027 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012028 PyErr_SetString(PyExc_IndexError, "string index out of range");
12029 return NULL;
12030 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012031 if (start >= length || end < start)
12032 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012033
Victor Stinner684d5fd2012-05-03 02:32:34 +020012034 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012035 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012036 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012037 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012038 }
12039 else {
12040 kind = PyUnicode_KIND(self);
12041 data = PyUnicode_1BYTE_DATA(self);
12042 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012043 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012044 length);
12045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012046}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047
12048static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012049do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051 Py_ssize_t len, i, j;
12052
12053 if (PyUnicode_READY(self) == -1)
12054 return NULL;
12055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012057
Victor Stinnercc7af722013-04-09 22:39:24 +020012058 if (PyUnicode_IS_ASCII(self)) {
12059 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12060
12061 i = 0;
12062 if (striptype != RIGHTSTRIP) {
12063 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012064 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012065 if (!_Py_ascii_whitespace[ch])
12066 break;
12067 i++;
12068 }
12069 }
12070
12071 j = len;
12072 if (striptype != LEFTSTRIP) {
12073 j--;
12074 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012075 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012076 if (!_Py_ascii_whitespace[ch])
12077 break;
12078 j--;
12079 }
12080 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012081 }
12082 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012083 else {
12084 int kind = PyUnicode_KIND(self);
12085 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012086
Victor Stinnercc7af722013-04-09 22:39:24 +020012087 i = 0;
12088 if (striptype != RIGHTSTRIP) {
12089 while (i < len) {
12090 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12091 if (!Py_UNICODE_ISSPACE(ch))
12092 break;
12093 i++;
12094 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012095 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012096
12097 j = len;
12098 if (striptype != LEFTSTRIP) {
12099 j--;
12100 while (j >= i) {
12101 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12102 if (!Py_UNICODE_ISSPACE(ch))
12103 break;
12104 j--;
12105 }
12106 j++;
12107 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012108 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012109
Victor Stinner7931d9a2011-11-04 00:22:48 +010012110 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111}
12112
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012113
12114static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012115do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012116{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012117 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012118
Serhiy Storchakac6792272013-10-19 21:03:34 +030012119 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012120 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012121
Benjamin Peterson14339b62009-01-31 16:36:08 +000012122 if (sep != NULL && sep != Py_None) {
12123 if (PyUnicode_Check(sep))
12124 return _PyUnicode_XStrip(self, striptype, sep);
12125 else {
12126 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012127 "%s arg must be None or str",
12128 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012129 return NULL;
12130 }
12131 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012132
Benjamin Peterson14339b62009-01-31 16:36:08 +000012133 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012134}
12135
12136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012137PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012138 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012139\n\
12140Return a copy of the string S with leading and trailing\n\
12141whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012142If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012143
12144static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012145unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012146{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012147 if (PyTuple_GET_SIZE(args) == 0)
12148 return do_strip(self, BOTHSTRIP); /* Common case */
12149 else
12150 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012151}
12152
12153
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012154PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012155 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012156\n\
12157Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012158If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012159
12160static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012161unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012162{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012163 if (PyTuple_GET_SIZE(args) == 0)
12164 return do_strip(self, LEFTSTRIP); /* Common case */
12165 else
12166 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012167}
12168
12169
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012170PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012171 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012172\n\
12173Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012174If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012175
12176static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012177unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012178{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012179 if (PyTuple_GET_SIZE(args) == 0)
12180 return do_strip(self, RIGHTSTRIP); /* Common case */
12181 else
12182 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012183}
12184
12185
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012187unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012189 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191
Serhiy Storchaka05997252013-01-26 12:14:02 +020012192 if (len < 1)
12193 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194
Victor Stinnerc4b49542011-12-11 22:44:26 +010012195 /* no repeat, return original string */
12196 if (len == 1)
12197 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012198
Benjamin Petersonbac79492012-01-14 13:34:47 -050012199 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 return NULL;
12201
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012202 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012203 PyErr_SetString(PyExc_OverflowError,
12204 "repeated string is too long");
12205 return NULL;
12206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012208
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012209 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210 if (!u)
12211 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012212 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 if (PyUnicode_GET_LENGTH(str) == 1) {
12215 const int kind = PyUnicode_KIND(str);
12216 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012217 if (kind == PyUnicode_1BYTE_KIND) {
12218 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012219 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012220 }
12221 else if (kind == PyUnicode_2BYTE_KIND) {
12222 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012223 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012224 ucs2[n] = fill_char;
12225 } else {
12226 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12227 assert(kind == PyUnicode_4BYTE_KIND);
12228 for (n = 0; n < len; ++n)
12229 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012230 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 }
12232 else {
12233 /* number of characters copied this far */
12234 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012235 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236 char *to = (char *) PyUnicode_DATA(u);
12237 Py_MEMCPY(to, PyUnicode_DATA(str),
12238 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012239 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012240 n = (done <= nchars-done) ? done : nchars-done;
12241 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012242 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244 }
12245
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012246 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012247 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248}
12249
Alexander Belopolsky40018472011-02-26 01:02:56 +000012250PyObject *
12251PyUnicode_Replace(PyObject *obj,
12252 PyObject *subobj,
12253 PyObject *replobj,
12254 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255{
12256 PyObject *self;
12257 PyObject *str1;
12258 PyObject *str2;
12259 PyObject *result;
12260
12261 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012262 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012263 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012265 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012266 Py_DECREF(self);
12267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268 }
12269 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012270 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012271 Py_DECREF(self);
12272 Py_DECREF(str1);
12273 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012275 if (PyUnicode_READY(self) == -1 ||
12276 PyUnicode_READY(str1) == -1 ||
12277 PyUnicode_READY(str2) == -1)
12278 result = NULL;
12279 else
12280 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281 Py_DECREF(self);
12282 Py_DECREF(str1);
12283 Py_DECREF(str2);
12284 return result;
12285}
12286
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012287PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012288 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289\n\
12290Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012291old replaced by new. If the optional argument count is\n\
12292given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293
12294static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012295unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 PyObject *str1;
12298 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012299 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300 PyObject *result;
12301
Martin v. Löwis18e16552006-02-15 17:27:45 +000012302 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012304 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012305 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012307 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308 return NULL;
12309 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012310 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012311 Py_DECREF(str1);
12312 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012313 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012314 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12315 result = NULL;
12316 else
12317 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012318
12319 Py_DECREF(str1);
12320 Py_DECREF(str2);
12321 return result;
12322}
12323
Alexander Belopolsky40018472011-02-26 01:02:56 +000012324static PyObject *
12325unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012326{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012327 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012328 Py_ssize_t isize;
12329 Py_ssize_t osize, squote, dquote, i, o;
12330 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012331 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012335 return NULL;
12336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337 isize = PyUnicode_GET_LENGTH(unicode);
12338 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 /* Compute length of output, quote characters, and
12341 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012342 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 max = 127;
12344 squote = dquote = 0;
12345 ikind = PyUnicode_KIND(unicode);
12346 for (i = 0; i < isize; i++) {
12347 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012348 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012350 case '\'': squote++; break;
12351 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012353 incr = 2;
12354 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 default:
12356 /* Fast-path ASCII */
12357 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012358 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012360 ;
12361 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012364 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012365 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012366 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012368 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012370 if (osize > PY_SSIZE_T_MAX - incr) {
12371 PyErr_SetString(PyExc_OverflowError,
12372 "string is too long to generate repr");
12373 return NULL;
12374 }
12375 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 }
12377
12378 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012379 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012381 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 if (dquote)
12383 /* Both squote and dquote present. Use squote,
12384 and escape them */
12385 osize += squote;
12386 else
12387 quote = '"';
12388 }
Victor Stinner55c08782013-04-14 18:45:39 +020012389 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390
12391 repr = PyUnicode_New(osize, max);
12392 if (repr == NULL)
12393 return NULL;
12394 okind = PyUnicode_KIND(repr);
12395 odata = PyUnicode_DATA(repr);
12396
12397 PyUnicode_WRITE(okind, odata, 0, quote);
12398 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012399 if (unchanged) {
12400 _PyUnicode_FastCopyCharacters(repr, 1,
12401 unicode, 0,
12402 isize);
12403 }
12404 else {
12405 for (i = 0, o = 1; i < isize; i++) {
12406 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012407
Victor Stinner55c08782013-04-14 18:45:39 +020012408 /* Escape quotes and backslashes */
12409 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012410 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012411 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012412 continue;
12413 }
12414
12415 /* Map special whitespace to '\t', \n', '\r' */
12416 if (ch == '\t') {
12417 PyUnicode_WRITE(okind, odata, o++, '\\');
12418 PyUnicode_WRITE(okind, odata, o++, 't');
12419 }
12420 else if (ch == '\n') {
12421 PyUnicode_WRITE(okind, odata, o++, '\\');
12422 PyUnicode_WRITE(okind, odata, o++, 'n');
12423 }
12424 else if (ch == '\r') {
12425 PyUnicode_WRITE(okind, odata, o++, '\\');
12426 PyUnicode_WRITE(okind, odata, o++, 'r');
12427 }
12428
12429 /* Map non-printable US ASCII to '\xhh' */
12430 else if (ch < ' ' || ch == 0x7F) {
12431 PyUnicode_WRITE(okind, odata, o++, '\\');
12432 PyUnicode_WRITE(okind, odata, o++, 'x');
12433 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12434 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12435 }
12436
12437 /* Copy ASCII characters as-is */
12438 else if (ch < 0x7F) {
12439 PyUnicode_WRITE(okind, odata, o++, ch);
12440 }
12441
12442 /* Non-ASCII characters */
12443 else {
12444 /* Map Unicode whitespace and control characters
12445 (categories Z* and C* except ASCII space)
12446 */
12447 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12448 PyUnicode_WRITE(okind, odata, o++, '\\');
12449 /* Map 8-bit characters to '\xhh' */
12450 if (ch <= 0xff) {
12451 PyUnicode_WRITE(okind, odata, o++, 'x');
12452 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12453 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12454 }
12455 /* Map 16-bit characters to '\uxxxx' */
12456 else if (ch <= 0xffff) {
12457 PyUnicode_WRITE(okind, odata, o++, 'u');
12458 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12459 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12460 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12461 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12462 }
12463 /* Map 21-bit characters to '\U00xxxxxx' */
12464 else {
12465 PyUnicode_WRITE(okind, odata, o++, 'U');
12466 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12467 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12468 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12469 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12470 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12471 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12472 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12473 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12474 }
12475 }
12476 /* Copy characters as-is */
12477 else {
12478 PyUnicode_WRITE(okind, odata, o++, ch);
12479 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012480 }
12481 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012482 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012484 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012485 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486}
12487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012488PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012489 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490\n\
12491Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012492such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493arguments start and end are interpreted as in slice notation.\n\
12494\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012495Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012496
12497static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012499{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012500 /* initialize variables to prevent gcc warning */
12501 PyObject *substring = NULL;
12502 Py_ssize_t start = 0;
12503 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012504 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505
Jesus Ceaac451502011-04-20 17:09:23 +020012506 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12507 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012508 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509
Christian Heimesea71a522013-06-29 21:17:34 +020012510 if (PyUnicode_READY(self) == -1) {
12511 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012513 }
12514 if (PyUnicode_READY(substring) == -1) {
12515 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012516 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012517 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012518
Victor Stinner7931d9a2011-11-04 00:22:48 +010012519 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520
12521 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012523 if (result == -2)
12524 return NULL;
12525
Christian Heimes217cfd12007-12-02 14:31:20 +000012526 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527}
12528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012529PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012530 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012532Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533
12534static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012537 /* initialize variables to prevent gcc warning */
12538 PyObject *substring = NULL;
12539 Py_ssize_t start = 0;
12540 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012541 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542
Jesus Ceaac451502011-04-20 17:09:23 +020012543 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12544 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012545 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546
Christian Heimesea71a522013-06-29 21:17:34 +020012547 if (PyUnicode_READY(self) == -1) {
12548 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012549 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012550 }
12551 if (PyUnicode_READY(substring) == -1) {
12552 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012554 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012555
Victor Stinner7931d9a2011-11-04 00:22:48 +010012556 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557
12558 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012560 if (result == -2)
12561 return NULL;
12562
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563 if (result < 0) {
12564 PyErr_SetString(PyExc_ValueError, "substring not found");
12565 return NULL;
12566 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012567
Christian Heimes217cfd12007-12-02 14:31:20 +000012568 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569}
12570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012571PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012572 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012574Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012575done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576
12577static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012578unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012580 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 Py_UCS4 fillchar = ' ';
12582
Victor Stinnere9a29352011-10-01 02:14:59 +020012583 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012585
Benjamin Petersonbac79492012-01-14 13:34:47 -050012586 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587 return NULL;
12588
Victor Stinnerc4b49542011-12-11 22:44:26 +010012589 if (PyUnicode_GET_LENGTH(self) >= width)
12590 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591
Victor Stinnerc4b49542011-12-11 22:44:26 +010012592 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593}
12594
Alexander Belopolsky40018472011-02-26 01:02:56 +000012595PyObject *
12596PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012597{
12598 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012599
Guido van Rossumd57fd912000-03-10 22:53:23 +000012600 s = PyUnicode_FromObject(s);
12601 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012602 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012603 if (sep != NULL) {
12604 sep = PyUnicode_FromObject(sep);
12605 if (sep == NULL) {
12606 Py_DECREF(s);
12607 return NULL;
12608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609 }
12610
Victor Stinner9310abb2011-10-05 00:59:23 +020012611 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012612
12613 Py_DECREF(s);
12614 Py_XDECREF(sep);
12615 return result;
12616}
12617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012618PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012619 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620\n\
12621Return a list of the words in S, using sep as the\n\
12622delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012623splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012624whitespace string is a separator and empty strings are\n\
12625removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626
12627static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012628unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012630 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012632 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012634 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12635 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636 return NULL;
12637
12638 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012639 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012641 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012642 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012643 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644}
12645
Thomas Wouters477c8d52006-05-27 19:21:47 +000012646PyObject *
12647PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12648{
12649 PyObject* str_obj;
12650 PyObject* sep_obj;
12651 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012652 int kind1, kind2;
12653 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012655
12656 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012657 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012658 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012659 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012660 if (!sep_obj) {
12661 Py_DECREF(str_obj);
12662 return NULL;
12663 }
12664 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12665 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012666 Py_DECREF(str_obj);
12667 return NULL;
12668 }
12669
Victor Stinner14f8f022011-10-05 20:58:25 +020012670 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012671 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012672 len1 = PyUnicode_GET_LENGTH(str_obj);
12673 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012674 if (kind1 < kind2 || len1 < len2) {
12675 _Py_INCREF_UNICODE_EMPTY();
12676 if (!unicode_empty)
12677 out = NULL;
12678 else {
12679 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12680 Py_DECREF(unicode_empty);
12681 }
12682 Py_DECREF(sep_obj);
12683 Py_DECREF(str_obj);
12684 return out;
12685 }
12686 buf1 = PyUnicode_DATA(str_obj);
12687 buf2 = PyUnicode_DATA(sep_obj);
12688 if (kind2 != kind1) {
12689 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12690 if (!buf2)
12691 goto onError;
12692 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012693
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012694 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012695 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012696 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12697 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12698 else
12699 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700 break;
12701 case PyUnicode_2BYTE_KIND:
12702 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12703 break;
12704 case PyUnicode_4BYTE_KIND:
12705 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12706 break;
12707 default:
12708 assert(0);
12709 out = 0;
12710 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012711
12712 Py_DECREF(sep_obj);
12713 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012714 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012716
12717 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012718 onError:
12719 Py_DECREF(sep_obj);
12720 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012721 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012722 PyMem_Free(buf2);
12723 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012724}
12725
12726
12727PyObject *
12728PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12729{
12730 PyObject* str_obj;
12731 PyObject* sep_obj;
12732 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012733 int kind1, kind2;
12734 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012736
12737 str_obj = PyUnicode_FromObject(str_in);
12738 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012739 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012740 sep_obj = PyUnicode_FromObject(sep_in);
12741 if (!sep_obj) {
12742 Py_DECREF(str_obj);
12743 return NULL;
12744 }
12745
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012746 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012747 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012748 len1 = PyUnicode_GET_LENGTH(str_obj);
12749 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012750 if (kind1 < kind2 || len1 < len2) {
12751 _Py_INCREF_UNICODE_EMPTY();
12752 if (!unicode_empty)
12753 out = NULL;
12754 else {
12755 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12756 Py_DECREF(unicode_empty);
12757 }
12758 Py_DECREF(sep_obj);
12759 Py_DECREF(str_obj);
12760 return out;
12761 }
12762 buf1 = PyUnicode_DATA(str_obj);
12763 buf2 = PyUnicode_DATA(sep_obj);
12764 if (kind2 != kind1) {
12765 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12766 if (!buf2)
12767 goto onError;
12768 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012769
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012770 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012771 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012772 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12773 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12774 else
12775 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012776 break;
12777 case PyUnicode_2BYTE_KIND:
12778 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12779 break;
12780 case PyUnicode_4BYTE_KIND:
12781 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12782 break;
12783 default:
12784 assert(0);
12785 out = 0;
12786 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012787
12788 Py_DECREF(sep_obj);
12789 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012790 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012791 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012792
12793 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012794 onError:
12795 Py_DECREF(sep_obj);
12796 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012797 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798 PyMem_Free(buf2);
12799 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012800}
12801
12802PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012803 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012804\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012805Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012806the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012807found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012808
12809static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012810unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012811{
Victor Stinner9310abb2011-10-05 00:59:23 +020012812 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012813}
12814
12815PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012816 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012817\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012818Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012819the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012820separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012821
12822static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012823unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012824{
Victor Stinner9310abb2011-10-05 00:59:23 +020012825 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012826}
12827
Alexander Belopolsky40018472011-02-26 01:02:56 +000012828PyObject *
12829PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012830{
12831 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012832
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012833 s = PyUnicode_FromObject(s);
12834 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012835 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012836 if (sep != NULL) {
12837 sep = PyUnicode_FromObject(sep);
12838 if (sep == NULL) {
12839 Py_DECREF(s);
12840 return NULL;
12841 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012842 }
12843
Victor Stinner9310abb2011-10-05 00:59:23 +020012844 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012845
12846 Py_DECREF(s);
12847 Py_XDECREF(sep);
12848 return result;
12849}
12850
12851PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012852 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012853\n\
12854Return a list of the words in S, using sep as the\n\
12855delimiter string, starting at the end of the string and\n\
12856working to the front. If maxsplit is given, at most maxsplit\n\
12857splits are done. If sep is not specified, any whitespace string\n\
12858is a separator.");
12859
12860static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012861unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012862{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012863 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012864 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012865 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012866
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012867 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12868 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012869 return NULL;
12870
12871 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012872 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012873 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012874 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012875 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012876 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012877}
12878
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012879PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012880 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881\n\
12882Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012883Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012884is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885
12886static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012887unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012888{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012889 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012890 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012892 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12893 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012894 return NULL;
12895
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012896 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012897}
12898
12899static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012900PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012901{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012902 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012903}
12904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012905PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012906 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012907\n\
12908Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012909and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012910
12911static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012912unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012913{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012914 if (PyUnicode_READY(self) == -1)
12915 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012916 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012917}
12918
Larry Hastings61272b72014-01-07 12:41:53 -080012919/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012920
Larry Hastings31826802013-10-19 00:09:25 -070012921@staticmethod
12922str.maketrans as unicode_maketrans
12923
12924 x: object
12925
12926 y: unicode=NULL
12927
12928 z: unicode=NULL
12929
12930 /
12931
12932Return a translation table usable for str.translate().
12933
12934If there is only one argument, it must be a dictionary mapping Unicode
12935ordinals (integers) or characters to Unicode ordinals, strings or None.
12936Character keys will be then converted to ordinals.
12937If there are two arguments, they must be strings of equal length, and
12938in the resulting dictionary, each character in x will be mapped to the
12939character at the same position in y. If there is a third argument, it
12940must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012941[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012942
Larry Hastings31826802013-10-19 00:09:25 -070012943static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012944unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030012945/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012946{
Georg Brandlceee0772007-11-27 23:48:05 +000012947 PyObject *new = NULL, *key, *value;
12948 Py_ssize_t i = 0;
12949 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012950
Georg Brandlceee0772007-11-27 23:48:05 +000012951 new = PyDict_New();
12952 if (!new)
12953 return NULL;
12954 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012955 int x_kind, y_kind, z_kind;
12956 void *x_data, *y_data, *z_data;
12957
Georg Brandlceee0772007-11-27 23:48:05 +000012958 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012959 if (!PyUnicode_Check(x)) {
12960 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12961 "be a string if there is a second argument");
12962 goto err;
12963 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012964 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012965 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12966 "arguments must have equal length");
12967 goto err;
12968 }
12969 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012970 x_kind = PyUnicode_KIND(x);
12971 y_kind = PyUnicode_KIND(y);
12972 x_data = PyUnicode_DATA(x);
12973 y_data = PyUnicode_DATA(y);
12974 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12975 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012976 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012977 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012978 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012979 if (!value) {
12980 Py_DECREF(key);
12981 goto err;
12982 }
Georg Brandlceee0772007-11-27 23:48:05 +000012983 res = PyDict_SetItem(new, key, value);
12984 Py_DECREF(key);
12985 Py_DECREF(value);
12986 if (res < 0)
12987 goto err;
12988 }
12989 /* create entries for deleting chars in z */
12990 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991 z_kind = PyUnicode_KIND(z);
12992 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012993 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012994 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012995 if (!key)
12996 goto err;
12997 res = PyDict_SetItem(new, key, Py_None);
12998 Py_DECREF(key);
12999 if (res < 0)
13000 goto err;
13001 }
13002 }
13003 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004 int kind;
13005 void *data;
13006
Georg Brandlceee0772007-11-27 23:48:05 +000013007 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013008 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013009 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13010 "to maketrans it must be a dict");
13011 goto err;
13012 }
13013 /* copy entries into the new dict, converting string keys to int keys */
13014 while (PyDict_Next(x, &i, &key, &value)) {
13015 if (PyUnicode_Check(key)) {
13016 /* convert string keys to integer keys */
13017 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013018 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013019 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13020 "table must be of length 1");
13021 goto err;
13022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013023 kind = PyUnicode_KIND(key);
13024 data = PyUnicode_DATA(key);
13025 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013026 if (!newkey)
13027 goto err;
13028 res = PyDict_SetItem(new, newkey, value);
13029 Py_DECREF(newkey);
13030 if (res < 0)
13031 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013032 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013033 /* just keep integer keys */
13034 if (PyDict_SetItem(new, key, value) < 0)
13035 goto err;
13036 } else {
13037 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13038 "be strings or integers");
13039 goto err;
13040 }
13041 }
13042 }
13043 return new;
13044 err:
13045 Py_DECREF(new);
13046 return NULL;
13047}
13048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013049PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013050 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013051\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013052Return a copy of the string S in which each character has been mapped\n\
13053through the given translation table. The table must implement\n\
13054lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13055mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13056this operation raises LookupError, the character is left untouched.\n\
13057Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013058
13059static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013060unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013061{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013062 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013063}
13064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013065PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013066 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013067\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013068Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013069
13070static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013071unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013072{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013073 if (PyUnicode_READY(self) == -1)
13074 return NULL;
13075 if (PyUnicode_IS_ASCII(self))
13076 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013077 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013078}
13079
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013080PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013081 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013083Pad a numeric string S with zeros on the left, to fill a field\n\
13084of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085
13086static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013087unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013089 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013090 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013091 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013092 int kind;
13093 void *data;
13094 Py_UCS4 chr;
13095
Martin v. Löwis18e16552006-02-15 17:27:45 +000013096 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097 return NULL;
13098
Benjamin Petersonbac79492012-01-14 13:34:47 -050013099 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013100 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101
Victor Stinnerc4b49542011-12-11 22:44:26 +010013102 if (PyUnicode_GET_LENGTH(self) >= width)
13103 return unicode_result_unchanged(self);
13104
13105 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106
13107 u = pad(self, fill, 0, '0');
13108
Walter Dörwald068325e2002-04-15 13:36:47 +000013109 if (u == NULL)
13110 return NULL;
13111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013112 kind = PyUnicode_KIND(u);
13113 data = PyUnicode_DATA(u);
13114 chr = PyUnicode_READ(kind, data, fill);
13115
13116 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013118 PyUnicode_WRITE(kind, data, 0, chr);
13119 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120 }
13121
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013122 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013123 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125
#if 0
/* Compiled out: thin wrapper returning whatever
   PyUnicode_TransformDecimalAndSpaceToASCII() produces for self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13133
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013134PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013135 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013137Return True if S starts with the specified prefix, False otherwise.\n\
13138With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013139With optional end, stop comparing S at that position.\n\
13140prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141
13142static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013143unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013144 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013146 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013147 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013148 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013149 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013150 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013151
Jesus Ceaac451502011-04-20 17:09:23 +020013152 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013153 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013154 if (PyTuple_Check(subobj)) {
13155 Py_ssize_t i;
13156 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013157 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013158 if (substring == NULL)
13159 return NULL;
13160 result = tailmatch(self, substring, start, end, -1);
13161 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013162 if (result == -1)
13163 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013164 if (result) {
13165 Py_RETURN_TRUE;
13166 }
13167 }
13168 /* nothing matched */
13169 Py_RETURN_FALSE;
13170 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013171 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013172 if (substring == NULL) {
13173 if (PyErr_ExceptionMatches(PyExc_TypeError))
13174 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13175 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013176 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013177 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013178 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013179 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013180 if (result == -1)
13181 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013182 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013183}
13184
13185
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013186PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013187 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013188\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013189Return True if S ends with the specified suffix, False otherwise.\n\
13190With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013191With optional end, stop comparing S at that position.\n\
13192suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013193
13194static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013195unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013196 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013197{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013198 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013199 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013200 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013201 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013202 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013203
Jesus Ceaac451502011-04-20 17:09:23 +020013204 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013205 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013206 if (PyTuple_Check(subobj)) {
13207 Py_ssize_t i;
13208 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013209 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013210 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013211 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013212 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013213 result = tailmatch(self, substring, start, end, +1);
13214 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013215 if (result == -1)
13216 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013217 if (result) {
13218 Py_RETURN_TRUE;
13219 }
13220 }
13221 Py_RETURN_FALSE;
13222 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013223 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013224 if (substring == NULL) {
13225 if (PyErr_ExceptionMatches(PyExc_TypeError))
13226 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13227 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013228 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013229 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013230 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013231 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013232 if (result == -1)
13233 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013234 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013235}
13236
Victor Stinner202fdca2012-05-07 12:47:02 +020013237Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013238_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013239{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013240 if (!writer->readonly)
13241 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13242 else {
13243 /* Copy-on-write mode: set buffer size to 0 so
13244 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13245 * next write. */
13246 writer->size = 0;
13247 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013248 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13249 writer->data = PyUnicode_DATA(writer->buffer);
13250 writer->kind = PyUnicode_KIND(writer->buffer);
13251}
13252
Victor Stinnerd3f08822012-05-29 12:57:52 +020013253void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013254_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013255{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013256 memset(writer, 0, sizeof(*writer));
13257#ifdef Py_DEBUG
13258 writer->kind = 5; /* invalid kind */
13259#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013260 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013261}
13262
Victor Stinnerd3f08822012-05-29 12:57:52 +020013263int
13264_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13265 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013266{
Victor Stinner6989ba02013-11-18 21:08:39 +010013267#ifdef MS_WINDOWS
13268 /* On Windows, overallocate by 50% is the best factor */
13269# define OVERALLOCATE_FACTOR 2
13270#else
13271 /* On Linux, overallocate by 25% is the best factor */
13272# define OVERALLOCATE_FACTOR 4
13273#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013274 Py_ssize_t newlen;
13275 PyObject *newbuffer;
13276
Victor Stinnerd3f08822012-05-29 12:57:52 +020013277 assert(length > 0);
13278
Victor Stinner202fdca2012-05-07 12:47:02 +020013279 if (length > PY_SSIZE_T_MAX - writer->pos) {
13280 PyErr_NoMemory();
13281 return -1;
13282 }
13283 newlen = writer->pos + length;
13284
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013285 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013286
Victor Stinnerd3f08822012-05-29 12:57:52 +020013287 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013288 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013289 if (writer->overallocate
13290 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13291 /* overallocate to limit the number of realloc() */
13292 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013293 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013294 if (newlen < writer->min_length)
13295 newlen = writer->min_length;
13296
Victor Stinnerd3f08822012-05-29 12:57:52 +020013297 writer->buffer = PyUnicode_New(newlen, maxchar);
13298 if (writer->buffer == NULL)
13299 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013300 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013301 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013302 if (writer->overallocate
13303 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13304 /* overallocate to limit the number of realloc() */
13305 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013306 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013307 if (newlen < writer->min_length)
13308 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013309
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013310 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013311 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013312 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013313 newbuffer = PyUnicode_New(newlen, maxchar);
13314 if (newbuffer == NULL)
13315 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013316 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13317 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013318 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013319 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013320 }
13321 else {
13322 newbuffer = resize_compact(writer->buffer, newlen);
13323 if (newbuffer == NULL)
13324 return -1;
13325 }
13326 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013327 }
13328 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013329 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013330 newbuffer = PyUnicode_New(writer->size, maxchar);
13331 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013332 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013333 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13334 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013335 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013336 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013337 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013338 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013339
13340#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013341}
13342
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013343Py_LOCAL_INLINE(int)
13344_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013345{
13346 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13347 return -1;
13348 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13349 writer->pos++;
13350 return 0;
13351}
13352
13353int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013354_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13355{
13356 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13357}
13358
13359int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013360_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13361{
13362 Py_UCS4 maxchar;
13363 Py_ssize_t len;
13364
13365 if (PyUnicode_READY(str) == -1)
13366 return -1;
13367 len = PyUnicode_GET_LENGTH(str);
13368 if (len == 0)
13369 return 0;
13370 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13371 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013372 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013373 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013374 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013375 Py_INCREF(str);
13376 writer->buffer = str;
13377 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013378 writer->pos += len;
13379 return 0;
13380 }
13381 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13382 return -1;
13383 }
13384 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13385 str, 0, len);
13386 writer->pos += len;
13387 return 0;
13388}
13389
Victor Stinnere215d962012-10-06 23:03:36 +020013390int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013391_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13392 Py_ssize_t start, Py_ssize_t end)
13393{
13394 Py_UCS4 maxchar;
13395 Py_ssize_t len;
13396
13397 if (PyUnicode_READY(str) == -1)
13398 return -1;
13399
13400 assert(0 <= start);
13401 assert(end <= PyUnicode_GET_LENGTH(str));
13402 assert(start <= end);
13403
13404 if (end == 0)
13405 return 0;
13406
13407 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13408 return _PyUnicodeWriter_WriteStr(writer, str);
13409
13410 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13411 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13412 else
13413 maxchar = writer->maxchar;
13414 len = end - start;
13415
13416 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13417 return -1;
13418
13419 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13420 str, start, len);
13421 writer->pos += len;
13422 return 0;
13423}
13424
13425int
Victor Stinner4a587072013-11-19 12:54:53 +010013426_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13427 const char *ascii, Py_ssize_t len)
13428{
13429 if (len == -1)
13430 len = strlen(ascii);
13431
13432 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13433
13434 if (writer->buffer == NULL && !writer->overallocate) {
13435 PyObject *str;
13436
13437 str = _PyUnicode_FromASCII(ascii, len);
13438 if (str == NULL)
13439 return -1;
13440
13441 writer->readonly = 1;
13442 writer->buffer = str;
13443 _PyUnicodeWriter_Update(writer);
13444 writer->pos += len;
13445 return 0;
13446 }
13447
13448 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13449 return -1;
13450
13451 switch (writer->kind)
13452 {
13453 case PyUnicode_1BYTE_KIND:
13454 {
13455 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13456 Py_UCS1 *data = writer->data;
13457
13458 Py_MEMCPY(data + writer->pos, str, len);
13459 break;
13460 }
13461 case PyUnicode_2BYTE_KIND:
13462 {
13463 _PyUnicode_CONVERT_BYTES(
13464 Py_UCS1, Py_UCS2,
13465 ascii, ascii + len,
13466 (Py_UCS2 *)writer->data + writer->pos);
13467 break;
13468 }
13469 case PyUnicode_4BYTE_KIND:
13470 {
13471 _PyUnicode_CONVERT_BYTES(
13472 Py_UCS1, Py_UCS4,
13473 ascii, ascii + len,
13474 (Py_UCS4 *)writer->data + writer->pos);
13475 break;
13476 }
13477 default:
13478 assert(0);
13479 }
13480
13481 writer->pos += len;
13482 return 0;
13483}
13484
13485int
13486_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13487 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013488{
13489 Py_UCS4 maxchar;
13490
13491 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13492 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13493 return -1;
13494 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13495 writer->pos += len;
13496 return 0;
13497}
13498
Victor Stinnerd3f08822012-05-29 12:57:52 +020013499PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013500_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013501{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013502 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013503 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013504 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013505 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013506 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013507 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013508 str = writer->buffer;
13509 writer->buffer = NULL;
13510 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13511 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013512 }
13513 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13514 PyObject *newbuffer;
13515 newbuffer = resize_compact(writer->buffer, writer->pos);
13516 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013517 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013518 return NULL;
13519 }
13520 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013521 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013522 str = writer->buffer;
13523 writer->buffer = NULL;
13524 assert(_PyUnicode_CheckConsistency(str, 1));
13525 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013526}
13527
Victor Stinnerd3f08822012-05-29 12:57:52 +020013528void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013529_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013530{
13531 Py_CLEAR(writer->buffer);
13532}
13533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013534#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013535
13536PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013537 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013538\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013539Return a formatted version of S, using substitutions from args and kwargs.\n\
13540The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013541
Eric Smith27bbca62010-11-04 17:06:58 +000013542PyDoc_STRVAR(format_map__doc__,
13543 "S.format_map(mapping) -> str\n\
13544\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013545Return a formatted version of S, using substitutions from mapping.\n\
13546The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013547
Eric Smith4a7d76d2008-05-30 18:10:19 +000013548static PyObject *
13549unicode__format__(PyObject* self, PyObject* args)
13550{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013551 PyObject *format_spec;
13552 _PyUnicodeWriter writer;
13553 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013554
13555 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13556 return NULL;
13557
Victor Stinnerd3f08822012-05-29 12:57:52 +020013558 if (PyUnicode_READY(self) == -1)
13559 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013560 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013561 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13562 self, format_spec, 0,
13563 PyUnicode_GET_LENGTH(format_spec));
13564 if (ret == -1) {
13565 _PyUnicodeWriter_Dealloc(&writer);
13566 return NULL;
13567 }
13568 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013569}
13570
Eric Smith8c663262007-08-25 02:26:07 +000013571PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013572 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013573\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013574Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013575
13576static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013577unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013578{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013579 Py_ssize_t size;
13580
13581 /* If it's a compact object, account for base structure +
13582 character data. */
13583 if (PyUnicode_IS_COMPACT_ASCII(v))
13584 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13585 else if (PyUnicode_IS_COMPACT(v))
13586 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013587 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013588 else {
13589 /* If it is a two-block object, account for base object, and
13590 for character block if present. */
13591 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013592 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013593 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013594 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013595 }
13596 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013597 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013598 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013599 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013600 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013601 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013602
13603 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013604}
13605
13606PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013607 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013608
13609static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013610unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013611{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013612 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013613 if (!copy)
13614 return NULL;
13615 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013616}
13617
Guido van Rossumd57fd912000-03-10 22:53:23 +000013618static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013619 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013620 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013621 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13622 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013623 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13624 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013625 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013626 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13627 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13628 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013629 {"expandtabs", (PyCFunction) unicode_expandtabs,
13630 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013631 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013632 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013633 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13634 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13635 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013636 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013637 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13638 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13639 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013640 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013641 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013642 {"splitlines", (PyCFunction) unicode_splitlines,
13643 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013644 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013645 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13646 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13647 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13648 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13649 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13650 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13651 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13652 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13653 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13654 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13655 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13656 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13657 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13658 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013659 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013660 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013661 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013662 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013663 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013664 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013665 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013666 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013667#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013668 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013669 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013670#endif
13671
Benjamin Peterson14339b62009-01-31 16:36:08 +000013672 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013673 {NULL, NULL}
13674};
13675
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013676static PyObject *
13677unicode_mod(PyObject *v, PyObject *w)
13678{
Brian Curtindfc80e32011-08-10 20:28:54 -050013679 if (!PyUnicode_Check(v))
13680 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013681 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013682}
13683
13684static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013685 0, /*nb_add*/
13686 0, /*nb_subtract*/
13687 0, /*nb_multiply*/
13688 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013689};
13690
Guido van Rossumd57fd912000-03-10 22:53:23 +000013691static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013692 (lenfunc) unicode_length, /* sq_length */
13693 PyUnicode_Concat, /* sq_concat */
13694 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13695 (ssizeargfunc) unicode_getitem, /* sq_item */
13696 0, /* sq_slice */
13697 0, /* sq_ass_item */
13698 0, /* sq_ass_slice */
13699 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013700};
13701
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013702static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013703unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013704{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013705 if (PyUnicode_READY(self) == -1)
13706 return NULL;
13707
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013708 if (PyIndex_Check(item)) {
13709 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013710 if (i == -1 && PyErr_Occurred())
13711 return NULL;
13712 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013713 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013714 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013715 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013716 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013717 PyObject *result;
13718 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013719 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013720 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013722 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013723 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013724 return NULL;
13725 }
13726
13727 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013728 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013729 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013730 slicelength == PyUnicode_GET_LENGTH(self)) {
13731 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013732 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013733 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013734 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013735 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013736 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013737 src_kind = PyUnicode_KIND(self);
13738 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013739 if (!PyUnicode_IS_ASCII(self)) {
13740 kind_limit = kind_maxchar_limit(src_kind);
13741 max_char = 0;
13742 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13743 ch = PyUnicode_READ(src_kind, src_data, cur);
13744 if (ch > max_char) {
13745 max_char = ch;
13746 if (max_char >= kind_limit)
13747 break;
13748 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013749 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013750 }
Victor Stinner55c99112011-10-13 01:17:06 +020013751 else
13752 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013753 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013754 if (result == NULL)
13755 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013756 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013757 dest_data = PyUnicode_DATA(result);
13758
13759 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013760 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13761 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013762 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013763 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013764 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013765 } else {
13766 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13767 return NULL;
13768 }
13769}
13770
13771static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013772 (lenfunc)unicode_length, /* mp_length */
13773 (binaryfunc)unicode_subscript, /* mp_subscript */
13774 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013775};
13776
Guido van Rossumd57fd912000-03-10 22:53:23 +000013777
Guido van Rossumd57fd912000-03-10 22:53:23 +000013778/* Helpers for PyUnicode_Format() */
13779
Victor Stinnera47082312012-10-04 02:19:54 +020013780struct unicode_formatter_t {
13781 PyObject *args;
13782 int args_owned;
13783 Py_ssize_t arglen, argidx;
13784 PyObject *dict;
13785
13786 enum PyUnicode_Kind fmtkind;
13787 Py_ssize_t fmtcnt, fmtpos;
13788 void *fmtdata;
13789 PyObject *fmtstr;
13790
13791 _PyUnicodeWriter writer;
13792};
13793
13794struct unicode_format_arg_t {
13795 Py_UCS4 ch;
13796 int flags;
13797 Py_ssize_t width;
13798 int prec;
13799 int sign;
13800};
13801
Guido van Rossumd57fd912000-03-10 22:53:23 +000013802static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013803unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013804{
Victor Stinnera47082312012-10-04 02:19:54 +020013805 Py_ssize_t argidx = ctx->argidx;
13806
13807 if (argidx < ctx->arglen) {
13808 ctx->argidx++;
13809 if (ctx->arglen < 0)
13810 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013811 else
Victor Stinnera47082312012-10-04 02:19:54 +020013812 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013813 }
13814 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013815 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013816 return NULL;
13817}
13818
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013819/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013820
Victor Stinnera47082312012-10-04 02:19:54 +020013821/* Format a float into the writer if the writer is not NULL, or into *p_output
13822 otherwise.
13823
13824 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013825static int
Victor Stinnera47082312012-10-04 02:19:54 +020013826formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13827 PyObject **p_output,
13828 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013829{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013830 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013831 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013832 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013833 int prec;
13834 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013835
Guido van Rossumd57fd912000-03-10 22:53:23 +000013836 x = PyFloat_AsDouble(v);
13837 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013838 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013839
Victor Stinnera47082312012-10-04 02:19:54 +020013840 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013841 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013842 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013843
Victor Stinnera47082312012-10-04 02:19:54 +020013844 if (arg->flags & F_ALT)
13845 dtoa_flags = Py_DTSF_ALT;
13846 else
13847 dtoa_flags = 0;
13848 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013849 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013850 return -1;
13851 len = strlen(p);
13852 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013853 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013854 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013855 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013856 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013857 }
13858 else
13859 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013860 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013861 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013862}
13863
Victor Stinnerd0880d52012-04-27 23:40:13 +020013864/* formatlong() emulates the format codes d, u, o, x and X, and
13865 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13866 * Python's regular ints.
13867 * Return value: a new PyUnicodeObject*, or NULL if error.
13868 * The output string is of the form
13869 * "-"? ("0x" | "0X")? digit+
13870 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13871 * set in flags. The case of hex digits will be correct,
13872 * There will be at least prec digits, zero-filled on the left if
13873 * necessary to get that many.
13874 * val object to be converted
13875 * flags bitmask of format flags; only F_ALT is looked at
13876 * prec minimum number of digits; 0-fill on left if needed
13877 * type a character in [duoxX]; u acts the same as d
13878 *
13879 * CAUTION: o, x and X conversions on regular ints can never
13880 * produce a '-' sign, but can for Python's unbounded ints.
13881 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013882PyObject *
13883_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013884{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013885 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013886 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013887 Py_ssize_t i;
13888 int sign; /* 1 if '-', else 0 */
13889 int len; /* number of characters */
13890 Py_ssize_t llen;
13891 int numdigits; /* len == numnondigits + numdigits */
13892 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013893
Victor Stinnerd0880d52012-04-27 23:40:13 +020013894 /* Avoid exceeding SSIZE_T_MAX */
13895 if (prec > INT_MAX-3) {
13896 PyErr_SetString(PyExc_OverflowError,
13897 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013898 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013899 }
13900
13901 assert(PyLong_Check(val));
13902
13903 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013904 default:
13905 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013906 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013907 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013908 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013909 /* int and int subclasses should print numerically when a numeric */
13910 /* format code is used (see issue18780) */
13911 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013912 break;
13913 case 'o':
13914 numnondigits = 2;
13915 result = PyNumber_ToBase(val, 8);
13916 break;
13917 case 'x':
13918 case 'X':
13919 numnondigits = 2;
13920 result = PyNumber_ToBase(val, 16);
13921 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013922 }
13923 if (!result)
13924 return NULL;
13925
13926 assert(unicode_modifiable(result));
13927 assert(PyUnicode_IS_READY(result));
13928 assert(PyUnicode_IS_ASCII(result));
13929
13930 /* To modify the string in-place, there can only be one reference. */
13931 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013932 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013933 PyErr_BadInternalCall();
13934 return NULL;
13935 }
13936 buf = PyUnicode_DATA(result);
13937 llen = PyUnicode_GET_LENGTH(result);
13938 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013939 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013940 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080013941 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013942 return NULL;
13943 }
13944 len = (int)llen;
13945 sign = buf[0] == '-';
13946 numnondigits += sign;
13947 numdigits = len - numnondigits;
13948 assert(numdigits > 0);
13949
13950 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013951 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013952 (type == 'o' || type == 'x' || type == 'X'))) {
13953 assert(buf[sign] == '0');
13954 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13955 buf[sign+1] == 'o');
13956 numnondigits -= 2;
13957 buf += 2;
13958 len -= 2;
13959 if (sign)
13960 buf[0] = '-';
13961 assert(len == numnondigits + numdigits);
13962 assert(numdigits > 0);
13963 }
13964
13965 /* Fill with leading zeroes to meet minimum width. */
13966 if (prec > numdigits) {
13967 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13968 numnondigits + prec);
13969 char *b1;
13970 if (!r1) {
13971 Py_DECREF(result);
13972 return NULL;
13973 }
13974 b1 = PyBytes_AS_STRING(r1);
13975 for (i = 0; i < numnondigits; ++i)
13976 *b1++ = *buf++;
13977 for (i = 0; i < prec - numdigits; i++)
13978 *b1++ = '0';
13979 for (i = 0; i < numdigits; i++)
13980 *b1++ = *buf++;
13981 *b1 = '\0';
13982 Py_DECREF(result);
13983 result = r1;
13984 buf = PyBytes_AS_STRING(result);
13985 len = numnondigits + prec;
13986 }
13987
13988 /* Fix up case for hex conversions. */
13989 if (type == 'X') {
13990 /* Need to convert all lower case letters to upper case.
13991 and need to convert 0x to 0X (and -0x to -0X). */
13992 for (i = 0; i < len; i++)
13993 if (buf[i] >= 'a' && buf[i] <= 'x')
13994 buf[i] -= 'a'-'A';
13995 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013996 if (!PyUnicode_Check(result)
13997 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013998 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013999 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014000 Py_DECREF(result);
14001 result = unicode;
14002 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014003 else if (len != PyUnicode_GET_LENGTH(result)) {
14004 if (PyUnicode_Resize(&result, len) < 0)
14005 Py_CLEAR(result);
14006 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014007 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014008}
14009
Ethan Furmandf3ed242014-01-05 06:50:30 -080014010/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014011 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014012 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014013 * -1 and raise an exception on error */
14014static int
Victor Stinnera47082312012-10-04 02:19:54 +020014015mainformatlong(PyObject *v,
14016 struct unicode_format_arg_t *arg,
14017 PyObject **p_output,
14018 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014019{
14020 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014021 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014022
14023 if (!PyNumber_Check(v))
14024 goto wrongtype;
14025
Ethan Furman9ab74802014-03-21 06:38:46 -070014026 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014027 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014028 if (type == 'o' || type == 'x' || type == 'X') {
14029 iobj = PyNumber_Index(v);
14030 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014031 if (PyErr_ExceptionMatches(PyExc_TypeError))
14032 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014033 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014034 }
14035 }
14036 else {
14037 iobj = PyNumber_Long(v);
14038 if (iobj == NULL ) {
14039 if (PyErr_ExceptionMatches(PyExc_TypeError))
14040 goto wrongtype;
14041 return -1;
14042 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014043 }
14044 assert(PyLong_Check(iobj));
14045 }
14046 else {
14047 iobj = v;
14048 Py_INCREF(iobj);
14049 }
14050
14051 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014052 && arg->width == -1 && arg->prec == -1
14053 && !(arg->flags & (F_SIGN | F_BLANK))
14054 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014055 {
14056 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014057 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014058 int base;
14059
Victor Stinnera47082312012-10-04 02:19:54 +020014060 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014061 {
14062 default:
14063 assert(0 && "'type' not in [diuoxX]");
14064 case 'd':
14065 case 'i':
14066 case 'u':
14067 base = 10;
14068 break;
14069 case 'o':
14070 base = 8;
14071 break;
14072 case 'x':
14073 case 'X':
14074 base = 16;
14075 break;
14076 }
14077
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014078 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14079 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014080 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014081 }
14082 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014083 return 1;
14084 }
14085
Ethan Furmanb95b5612015-01-23 20:05:18 -080014086 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014087 Py_DECREF(iobj);
14088 if (res == NULL)
14089 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014090 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014091 return 0;
14092
14093wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014094 switch(type)
14095 {
14096 case 'o':
14097 case 'x':
14098 case 'X':
14099 PyErr_Format(PyExc_TypeError,
14100 "%%%c format: an integer is required, "
14101 "not %.200s",
14102 type, Py_TYPE(v)->tp_name);
14103 break;
14104 default:
14105 PyErr_Format(PyExc_TypeError,
14106 "%%%c format: a number is required, "
14107 "not %.200s",
14108 type, Py_TYPE(v)->tp_name);
14109 break;
14110 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014111 return -1;
14112}
14113
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014114static Py_UCS4
14115formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014116{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014117 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014118 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014119 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014120 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014121 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014122 goto onError;
14123 }
14124 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014125 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014126 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014127 /* make sure number is a type of integer */
14128 if (!PyLong_Check(v)) {
14129 iobj = PyNumber_Index(v);
14130 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014131 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014132 }
14133 v = iobj;
14134 Py_DECREF(iobj);
14135 }
14136 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014137 x = PyLong_AsLong(v);
14138 if (x == -1 && PyErr_Occurred())
14139 goto onError;
14140
Victor Stinner8faf8212011-12-08 22:14:11 +010014141 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014142 PyErr_SetString(PyExc_OverflowError,
14143 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014144 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014145 }
14146
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014147 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014148 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014149
Benjamin Peterson29060642009-01-31 22:14:21 +000014150 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014151 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014152 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014153 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014154}
14155
Victor Stinnera47082312012-10-04 02:19:54 +020014156/* Parse options of an argument: flags, width, precision.
14157 Handle also "%(name)" syntax.
14158
14159 Return 0 if the argument has been formatted into arg->str.
14160 Return 1 if the argument has been written into ctx->writer,
14161 Raise an exception and return -1 on error. */
14162static int
14163unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14164 struct unicode_format_arg_t *arg)
14165{
14166#define FORMAT_READ(ctx) \
14167 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14168
14169 PyObject *v;
14170
Victor Stinnera47082312012-10-04 02:19:54 +020014171 if (arg->ch == '(') {
14172 /* Get argument value from a dictionary. Example: "%(name)s". */
14173 Py_ssize_t keystart;
14174 Py_ssize_t keylen;
14175 PyObject *key;
14176 int pcount = 1;
14177
14178 if (ctx->dict == NULL) {
14179 PyErr_SetString(PyExc_TypeError,
14180 "format requires a mapping");
14181 return -1;
14182 }
14183 ++ctx->fmtpos;
14184 --ctx->fmtcnt;
14185 keystart = ctx->fmtpos;
14186 /* Skip over balanced parentheses */
14187 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14188 arg->ch = FORMAT_READ(ctx);
14189 if (arg->ch == ')')
14190 --pcount;
14191 else if (arg->ch == '(')
14192 ++pcount;
14193 ctx->fmtpos++;
14194 }
14195 keylen = ctx->fmtpos - keystart - 1;
14196 if (ctx->fmtcnt < 0 || pcount > 0) {
14197 PyErr_SetString(PyExc_ValueError,
14198 "incomplete format key");
14199 return -1;
14200 }
14201 key = PyUnicode_Substring(ctx->fmtstr,
14202 keystart, keystart + keylen);
14203 if (key == NULL)
14204 return -1;
14205 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014206 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014207 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014208 }
14209 ctx->args = PyObject_GetItem(ctx->dict, key);
14210 Py_DECREF(key);
14211 if (ctx->args == NULL)
14212 return -1;
14213 ctx->args_owned = 1;
14214 ctx->arglen = -1;
14215 ctx->argidx = -2;
14216 }
14217
14218 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014219 while (--ctx->fmtcnt >= 0) {
14220 arg->ch = FORMAT_READ(ctx);
14221 ctx->fmtpos++;
14222 switch (arg->ch) {
14223 case '-': arg->flags |= F_LJUST; continue;
14224 case '+': arg->flags |= F_SIGN; continue;
14225 case ' ': arg->flags |= F_BLANK; continue;
14226 case '#': arg->flags |= F_ALT; continue;
14227 case '0': arg->flags |= F_ZERO; continue;
14228 }
14229 break;
14230 }
14231
14232 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014233 if (arg->ch == '*') {
14234 v = unicode_format_getnextarg(ctx);
14235 if (v == NULL)
14236 return -1;
14237 if (!PyLong_Check(v)) {
14238 PyErr_SetString(PyExc_TypeError,
14239 "* wants int");
14240 return -1;
14241 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014242 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014243 if (arg->width == -1 && PyErr_Occurred())
14244 return -1;
14245 if (arg->width < 0) {
14246 arg->flags |= F_LJUST;
14247 arg->width = -arg->width;
14248 }
14249 if (--ctx->fmtcnt >= 0) {
14250 arg->ch = FORMAT_READ(ctx);
14251 ctx->fmtpos++;
14252 }
14253 }
14254 else if (arg->ch >= '0' && arg->ch <= '9') {
14255 arg->width = arg->ch - '0';
14256 while (--ctx->fmtcnt >= 0) {
14257 arg->ch = FORMAT_READ(ctx);
14258 ctx->fmtpos++;
14259 if (arg->ch < '0' || arg->ch > '9')
14260 break;
14261 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14262 mixing signed and unsigned comparison. Since arg->ch is between
14263 '0' and '9', casting to int is safe. */
14264 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14265 PyErr_SetString(PyExc_ValueError,
14266 "width too big");
14267 return -1;
14268 }
14269 arg->width = arg->width*10 + (arg->ch - '0');
14270 }
14271 }
14272
14273 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014274 if (arg->ch == '.') {
14275 arg->prec = 0;
14276 if (--ctx->fmtcnt >= 0) {
14277 arg->ch = FORMAT_READ(ctx);
14278 ctx->fmtpos++;
14279 }
14280 if (arg->ch == '*') {
14281 v = unicode_format_getnextarg(ctx);
14282 if (v == NULL)
14283 return -1;
14284 if (!PyLong_Check(v)) {
14285 PyErr_SetString(PyExc_TypeError,
14286 "* wants int");
14287 return -1;
14288 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014289 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014290 if (arg->prec == -1 && PyErr_Occurred())
14291 return -1;
14292 if (arg->prec < 0)
14293 arg->prec = 0;
14294 if (--ctx->fmtcnt >= 0) {
14295 arg->ch = FORMAT_READ(ctx);
14296 ctx->fmtpos++;
14297 }
14298 }
14299 else if (arg->ch >= '0' && arg->ch <= '9') {
14300 arg->prec = arg->ch - '0';
14301 while (--ctx->fmtcnt >= 0) {
14302 arg->ch = FORMAT_READ(ctx);
14303 ctx->fmtpos++;
14304 if (arg->ch < '0' || arg->ch > '9')
14305 break;
14306 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14307 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014308 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014309 return -1;
14310 }
14311 arg->prec = arg->prec*10 + (arg->ch - '0');
14312 }
14313 }
14314 }
14315
14316 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14317 if (ctx->fmtcnt >= 0) {
14318 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14319 if (--ctx->fmtcnt >= 0) {
14320 arg->ch = FORMAT_READ(ctx);
14321 ctx->fmtpos++;
14322 }
14323 }
14324 }
14325 if (ctx->fmtcnt < 0) {
14326 PyErr_SetString(PyExc_ValueError,
14327 "incomplete format");
14328 return -1;
14329 }
14330 return 0;
14331
14332#undef FORMAT_READ
14333}
14334
14335/* Format one argument. Supported conversion specifiers:
14336
14337 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014338 - "i", "d", "u": int or float
14339 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014340 - "e", "E", "f", "F", "g", "G": float
14341 - "c": int or str (1 character)
14342
Victor Stinner8dbd4212012-12-04 09:30:24 +010014343 When possible, the output is written directly into the Unicode writer
14344 (ctx->writer). A string is created when padding is required.
14345
Victor Stinnera47082312012-10-04 02:19:54 +020014346 Return 0 if the argument has been formatted into *p_str,
14347 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014348 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014349static int
14350unicode_format_arg_format(struct unicode_formatter_t *ctx,
14351 struct unicode_format_arg_t *arg,
14352 PyObject **p_str)
14353{
14354 PyObject *v;
14355 _PyUnicodeWriter *writer = &ctx->writer;
14356
14357 if (ctx->fmtcnt == 0)
14358 ctx->writer.overallocate = 0;
14359
14360 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014361 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014362 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014363 return 1;
14364 }
14365
14366 v = unicode_format_getnextarg(ctx);
14367 if (v == NULL)
14368 return -1;
14369
Victor Stinnera47082312012-10-04 02:19:54 +020014370
14371 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014372 case 's':
14373 case 'r':
14374 case 'a':
14375 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14376 /* Fast path */
14377 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14378 return -1;
14379 return 1;
14380 }
14381
14382 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14383 *p_str = v;
14384 Py_INCREF(*p_str);
14385 }
14386 else {
14387 if (arg->ch == 's')
14388 *p_str = PyObject_Str(v);
14389 else if (arg->ch == 'r')
14390 *p_str = PyObject_Repr(v);
14391 else
14392 *p_str = PyObject_ASCII(v);
14393 }
14394 break;
14395
14396 case 'i':
14397 case 'd':
14398 case 'u':
14399 case 'o':
14400 case 'x':
14401 case 'X':
14402 {
14403 int ret = mainformatlong(v, arg, p_str, writer);
14404 if (ret != 0)
14405 return ret;
14406 arg->sign = 1;
14407 break;
14408 }
14409
14410 case 'e':
14411 case 'E':
14412 case 'f':
14413 case 'F':
14414 case 'g':
14415 case 'G':
14416 if (arg->width == -1 && arg->prec == -1
14417 && !(arg->flags & (F_SIGN | F_BLANK)))
14418 {
14419 /* Fast path */
14420 if (formatfloat(v, arg, NULL, writer) == -1)
14421 return -1;
14422 return 1;
14423 }
14424
14425 arg->sign = 1;
14426 if (formatfloat(v, arg, p_str, NULL) == -1)
14427 return -1;
14428 break;
14429
14430 case 'c':
14431 {
14432 Py_UCS4 ch = formatchar(v);
14433 if (ch == (Py_UCS4) -1)
14434 return -1;
14435 if (arg->width == -1 && arg->prec == -1) {
14436 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014437 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014438 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014439 return 1;
14440 }
14441 *p_str = PyUnicode_FromOrdinal(ch);
14442 break;
14443 }
14444
14445 default:
14446 PyErr_Format(PyExc_ValueError,
14447 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014448 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014449 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14450 (int)arg->ch,
14451 ctx->fmtpos - 1);
14452 return -1;
14453 }
14454 if (*p_str == NULL)
14455 return -1;
14456 assert (PyUnicode_Check(*p_str));
14457 return 0;
14458}
14459
14460static int
14461unicode_format_arg_output(struct unicode_formatter_t *ctx,
14462 struct unicode_format_arg_t *arg,
14463 PyObject *str)
14464{
14465 Py_ssize_t len;
14466 enum PyUnicode_Kind kind;
14467 void *pbuf;
14468 Py_ssize_t pindex;
14469 Py_UCS4 signchar;
14470 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014471 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014472 Py_ssize_t sublen;
14473 _PyUnicodeWriter *writer = &ctx->writer;
14474 Py_UCS4 fill;
14475
14476 fill = ' ';
14477 if (arg->sign && arg->flags & F_ZERO)
14478 fill = '0';
14479
14480 if (PyUnicode_READY(str) == -1)
14481 return -1;
14482
14483 len = PyUnicode_GET_LENGTH(str);
14484 if ((arg->width == -1 || arg->width <= len)
14485 && (arg->prec == -1 || arg->prec >= len)
14486 && !(arg->flags & (F_SIGN | F_BLANK)))
14487 {
14488 /* Fast path */
14489 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14490 return -1;
14491 return 0;
14492 }
14493
14494 /* Truncate the string for "s", "r" and "a" formats
14495 if the precision is set */
14496 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14497 if (arg->prec >= 0 && len > arg->prec)
14498 len = arg->prec;
14499 }
14500
14501 /* Adjust sign and width */
14502 kind = PyUnicode_KIND(str);
14503 pbuf = PyUnicode_DATA(str);
14504 pindex = 0;
14505 signchar = '\0';
14506 if (arg->sign) {
14507 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14508 if (ch == '-' || ch == '+') {
14509 signchar = ch;
14510 len--;
14511 pindex++;
14512 }
14513 else if (arg->flags & F_SIGN)
14514 signchar = '+';
14515 else if (arg->flags & F_BLANK)
14516 signchar = ' ';
14517 else
14518 arg->sign = 0;
14519 }
14520 if (arg->width < len)
14521 arg->width = len;
14522
14523 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014524 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014525 if (!(arg->flags & F_LJUST)) {
14526 if (arg->sign) {
14527 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014528 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014529 }
14530 else {
14531 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014532 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014533 }
14534 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014535 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14536 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014537 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014538 }
14539
Victor Stinnera47082312012-10-04 02:19:54 +020014540 buflen = arg->width;
14541 if (arg->sign && len == arg->width)
14542 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014543 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014544 return -1;
14545
14546 /* Write the sign if needed */
14547 if (arg->sign) {
14548 if (fill != ' ') {
14549 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14550 writer->pos += 1;
14551 }
14552 if (arg->width > len)
14553 arg->width--;
14554 }
14555
14556 /* Write the numeric prefix for "x", "X" and "o" formats
14557 if the alternate form is used.
14558 For example, write "0x" for the "%#x" format. */
14559 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14560 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14561 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14562 if (fill != ' ') {
14563 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14564 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14565 writer->pos += 2;
14566 pindex += 2;
14567 }
14568 arg->width -= 2;
14569 if (arg->width < 0)
14570 arg->width = 0;
14571 len -= 2;
14572 }
14573
14574 /* Pad left with the fill character if needed */
14575 if (arg->width > len && !(arg->flags & F_LJUST)) {
14576 sublen = arg->width - len;
14577 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14578 writer->pos += sublen;
14579 arg->width = len;
14580 }
14581
14582 /* If padding with spaces: write sign if needed and/or numeric prefix if
14583 the alternate form is used */
14584 if (fill == ' ') {
14585 if (arg->sign) {
14586 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14587 writer->pos += 1;
14588 }
14589 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14590 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14591 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14592 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14593 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14594 writer->pos += 2;
14595 pindex += 2;
14596 }
14597 }
14598
14599 /* Write characters */
14600 if (len) {
14601 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14602 str, pindex, len);
14603 writer->pos += len;
14604 }
14605
14606 /* Pad right with the fill character if needed */
14607 if (arg->width > len) {
14608 sublen = arg->width - len;
14609 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14610 writer->pos += sublen;
14611 }
14612 return 0;
14613}
14614
14615/* Helper of PyUnicode_Format(): format one arg.
14616 Return 0 on success, raise an exception and return -1 on error. */
14617static int
14618unicode_format_arg(struct unicode_formatter_t *ctx)
14619{
14620 struct unicode_format_arg_t arg;
14621 PyObject *str;
14622 int ret;
14623
Victor Stinner8dbd4212012-12-04 09:30:24 +010014624 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14625 arg.flags = 0;
14626 arg.width = -1;
14627 arg.prec = -1;
14628 arg.sign = 0;
14629 str = NULL;
14630
Victor Stinnera47082312012-10-04 02:19:54 +020014631 ret = unicode_format_arg_parse(ctx, &arg);
14632 if (ret == -1)
14633 return -1;
14634
14635 ret = unicode_format_arg_format(ctx, &arg, &str);
14636 if (ret == -1)
14637 return -1;
14638
14639 if (ret != 1) {
14640 ret = unicode_format_arg_output(ctx, &arg, str);
14641 Py_DECREF(str);
14642 if (ret == -1)
14643 return -1;
14644 }
14645
14646 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14647 PyErr_SetString(PyExc_TypeError,
14648 "not all arguments converted during string formatting");
14649 return -1;
14650 }
14651 return 0;
14652}
14653
Alexander Belopolsky40018472011-02-26 01:02:56 +000014654PyObject *
14655PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014656{
Victor Stinnera47082312012-10-04 02:19:54 +020014657 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014658
Guido van Rossumd57fd912000-03-10 22:53:23 +000014659 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014660 PyErr_BadInternalCall();
14661 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014662 }
Victor Stinnera47082312012-10-04 02:19:54 +020014663
14664 ctx.fmtstr = PyUnicode_FromObject(format);
14665 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014666 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014667 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14668 Py_DECREF(ctx.fmtstr);
14669 return NULL;
14670 }
14671 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14672 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14673 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14674 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014675
Victor Stinner8f674cc2013-04-17 23:02:17 +020014676 _PyUnicodeWriter_Init(&ctx.writer);
14677 ctx.writer.min_length = ctx.fmtcnt + 100;
14678 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014679
Guido van Rossumd57fd912000-03-10 22:53:23 +000014680 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014681 ctx.arglen = PyTuple_Size(args);
14682 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014683 }
14684 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014685 ctx.arglen = -1;
14686 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014687 }
Victor Stinnera47082312012-10-04 02:19:54 +020014688 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014689 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014690 ctx.dict = args;
14691 else
14692 ctx.dict = NULL;
14693 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014694
Victor Stinnera47082312012-10-04 02:19:54 +020014695 while (--ctx.fmtcnt >= 0) {
14696 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014697 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014698
14699 nonfmtpos = ctx.fmtpos++;
14700 while (ctx.fmtcnt >= 0 &&
14701 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14702 ctx.fmtpos++;
14703 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014704 }
Victor Stinnera47082312012-10-04 02:19:54 +020014705 if (ctx.fmtcnt < 0) {
14706 ctx.fmtpos--;
14707 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014708 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014709
Victor Stinnercfc4c132013-04-03 01:48:39 +020014710 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14711 nonfmtpos, ctx.fmtpos) < 0)
14712 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014713 }
14714 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014715 ctx.fmtpos++;
14716 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014717 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014718 }
14719 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014720
Victor Stinnera47082312012-10-04 02:19:54 +020014721 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014722 PyErr_SetString(PyExc_TypeError,
14723 "not all arguments converted during string formatting");
14724 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014725 }
14726
Victor Stinnera47082312012-10-04 02:19:54 +020014727 if (ctx.args_owned) {
14728 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014729 }
Victor Stinnera47082312012-10-04 02:19:54 +020014730 Py_DECREF(ctx.fmtstr);
14731 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014732
Benjamin Peterson29060642009-01-31 22:14:21 +000014733 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014734 Py_DECREF(ctx.fmtstr);
14735 _PyUnicodeWriter_Dealloc(&ctx.writer);
14736 if (ctx.args_owned) {
14737 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014738 }
14739 return NULL;
14740}
14741
Jeremy Hylton938ace62002-07-17 16:30:39 +000014742static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014743unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14744
Tim Peters6d6c1a32001-08-02 04:15:00 +000014745static PyObject *
14746unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14747{
Benjamin Peterson29060642009-01-31 22:14:21 +000014748 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014749 static char *kwlist[] = {"object", "encoding", "errors", 0};
14750 char *encoding = NULL;
14751 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014752
Benjamin Peterson14339b62009-01-31 16:36:08 +000014753 if (type != &PyUnicode_Type)
14754 return unicode_subtype_new(type, args, kwds);
14755 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014756 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014757 return NULL;
14758 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014759 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014760 if (encoding == NULL && errors == NULL)
14761 return PyObject_Str(x);
14762 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014763 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014764}
14765
Guido van Rossume023fe02001-08-30 03:12:59 +000014766static PyObject *
14767unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14768{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014769 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014770 Py_ssize_t length, char_size;
14771 int share_wstr, share_utf8;
14772 unsigned int kind;
14773 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014774
Benjamin Peterson14339b62009-01-31 16:36:08 +000014775 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014776
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014777 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014778 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014779 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014780 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014781 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014782 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014783 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014784 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014785
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014786 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014787 if (self == NULL) {
14788 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014789 return NULL;
14790 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014791 kind = PyUnicode_KIND(unicode);
14792 length = PyUnicode_GET_LENGTH(unicode);
14793
14794 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014795#ifdef Py_DEBUG
14796 _PyUnicode_HASH(self) = -1;
14797#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014798 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014799#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014800 _PyUnicode_STATE(self).interned = 0;
14801 _PyUnicode_STATE(self).kind = kind;
14802 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014803 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014804 _PyUnicode_STATE(self).ready = 1;
14805 _PyUnicode_WSTR(self) = NULL;
14806 _PyUnicode_UTF8_LENGTH(self) = 0;
14807 _PyUnicode_UTF8(self) = NULL;
14808 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014809 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014810
14811 share_utf8 = 0;
14812 share_wstr = 0;
14813 if (kind == PyUnicode_1BYTE_KIND) {
14814 char_size = 1;
14815 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14816 share_utf8 = 1;
14817 }
14818 else if (kind == PyUnicode_2BYTE_KIND) {
14819 char_size = 2;
14820 if (sizeof(wchar_t) == 2)
14821 share_wstr = 1;
14822 }
14823 else {
14824 assert(kind == PyUnicode_4BYTE_KIND);
14825 char_size = 4;
14826 if (sizeof(wchar_t) == 4)
14827 share_wstr = 1;
14828 }
14829
14830 /* Ensure we won't overflow the length. */
14831 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14832 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014833 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014834 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014835 data = PyObject_MALLOC((length + 1) * char_size);
14836 if (data == NULL) {
14837 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014838 goto onError;
14839 }
14840
Victor Stinnerc3c74152011-10-02 20:39:55 +020014841 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014842 if (share_utf8) {
14843 _PyUnicode_UTF8_LENGTH(self) = length;
14844 _PyUnicode_UTF8(self) = data;
14845 }
14846 if (share_wstr) {
14847 _PyUnicode_WSTR_LENGTH(self) = length;
14848 _PyUnicode_WSTR(self) = (wchar_t *)data;
14849 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014850
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014851 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014852 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014853 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014854#ifdef Py_DEBUG
14855 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14856#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014857 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014858 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014859
14860onError:
14861 Py_DECREF(unicode);
14862 Py_DECREF(self);
14863 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014864}
14865
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014866PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014867"str(object='') -> str\n\
14868str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014869\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014870Create a new string object from the given object. If encoding or\n\
14871errors is specified, then the object must expose a data buffer\n\
14872that will be decoded using the given encoding and error handler.\n\
14873Otherwise, returns the result of object.__str__() (if defined)\n\
14874or repr(object).\n\
14875encoding defaults to sys.getdefaultencoding().\n\
14876errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014877
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014878static PyObject *unicode_iter(PyObject *seq);
14879
Guido van Rossumd57fd912000-03-10 22:53:23 +000014880PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014881 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014882 "str", /* tp_name */
14883 sizeof(PyUnicodeObject), /* tp_size */
14884 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014885 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014886 (destructor)unicode_dealloc, /* tp_dealloc */
14887 0, /* tp_print */
14888 0, /* tp_getattr */
14889 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014890 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014891 unicode_repr, /* tp_repr */
14892 &unicode_as_number, /* tp_as_number */
14893 &unicode_as_sequence, /* tp_as_sequence */
14894 &unicode_as_mapping, /* tp_as_mapping */
14895 (hashfunc) unicode_hash, /* tp_hash*/
14896 0, /* tp_call*/
14897 (reprfunc) unicode_str, /* tp_str */
14898 PyObject_GenericGetAttr, /* tp_getattro */
14899 0, /* tp_setattro */
14900 0, /* tp_as_buffer */
14901 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014902 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014903 unicode_doc, /* tp_doc */
14904 0, /* tp_traverse */
14905 0, /* tp_clear */
14906 PyUnicode_RichCompare, /* tp_richcompare */
14907 0, /* tp_weaklistoffset */
14908 unicode_iter, /* tp_iter */
14909 0, /* tp_iternext */
14910 unicode_methods, /* tp_methods */
14911 0, /* tp_members */
14912 0, /* tp_getset */
14913 &PyBaseObject_Type, /* tp_base */
14914 0, /* tp_dict */
14915 0, /* tp_descr_get */
14916 0, /* tp_descr_set */
14917 0, /* tp_dictoffset */
14918 0, /* tp_init */
14919 0, /* tp_alloc */
14920 unicode_new, /* tp_new */
14921 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014922};
14923
14924/* Initialize the Unicode implementation */
14925
Victor Stinner3a50e702011-10-18 21:21:00 +020014926int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014927{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014928 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014929 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014930 0x000A, /* LINE FEED */
14931 0x000D, /* CARRIAGE RETURN */
14932 0x001C, /* FILE SEPARATOR */
14933 0x001D, /* GROUP SEPARATOR */
14934 0x001E, /* RECORD SEPARATOR */
14935 0x0085, /* NEXT LINE */
14936 0x2028, /* LINE SEPARATOR */
14937 0x2029, /* PARAGRAPH SEPARATOR */
14938 };
14939
Fred Drakee4315f52000-05-09 19:53:39 +000014940 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014941 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014942 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014943 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014944 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014945
Guido van Rossumcacfc072002-05-24 19:01:59 +000014946 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014947 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014948
14949 /* initialize the linebreak bloom filter */
14950 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014951 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014952 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014953
Christian Heimes26532f72013-07-20 14:57:16 +020014954 if (PyType_Ready(&EncodingMapType) < 0)
14955 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014956
Benjamin Petersonc4311282012-10-30 23:21:10 -040014957 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14958 Py_FatalError("Can't initialize field name iterator type");
14959
14960 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14961 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014962
Victor Stinner3a50e702011-10-18 21:21:00 +020014963 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014964}
14965
14966/* Finalize the Unicode implementation */
14967
/* Clear the free list of the Unicode type.

   This implementation releases nothing and always returns 0
   (presumably the number of items freed -- confirm against callers). */
int
PyUnicode_ClearFreeList(void)
{
    /* Nothing to release. */
    return 0;
}
14973
Guido van Rossumd57fd912000-03-10 22:53:23 +000014974void
Thomas Wouters78890102000-07-22 19:25:51 +000014975_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014976{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014977 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014978
Serhiy Storchaka05997252013-01-26 12:14:02 +020014979 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014980
Serhiy Storchaka05997252013-01-26 12:14:02 +020014981 for (i = 0; i < 256; i++)
14982 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014983 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014984 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014985}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014986
Walter Dörwald16807132007-05-25 13:52:07 +000014987void
14988PyUnicode_InternInPlace(PyObject **p)
14989{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014990 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014991 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014992#ifdef Py_DEBUG
14993 assert(s != NULL);
14994 assert(_PyUnicode_CHECK(s));
14995#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014996 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014997 return;
14998#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014999 /* If it's a subclass, we don't really know what putting
15000 it in the interned dict might do. */
15001 if (!PyUnicode_CheckExact(s))
15002 return;
15003 if (PyUnicode_CHECK_INTERNED(s))
15004 return;
15005 if (interned == NULL) {
15006 interned = PyDict_New();
15007 if (interned == NULL) {
15008 PyErr_Clear(); /* Don't leave an exception */
15009 return;
15010 }
15011 }
15012 /* It might be that the GetItem call fails even
15013 though the key is present in the dictionary,
15014 namely when this happens during a stack overflow. */
15015 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015016 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015017 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015018
Victor Stinnerf0335102013-04-14 19:13:03 +020015019 if (t) {
15020 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015021 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015022 return;
15023 }
Walter Dörwald16807132007-05-25 13:52:07 +000015024
Benjamin Peterson14339b62009-01-31 16:36:08 +000015025 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015026 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015027 PyErr_Clear();
15028 PyThreadState_GET()->recursion_critical = 0;
15029 return;
15030 }
15031 PyThreadState_GET()->recursion_critical = 0;
15032 /* The two references in interned are not counted by refcnt.
15033 The deallocator will take care of this */
15034 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015035 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015036}
15037
15038void
15039PyUnicode_InternImmortal(PyObject **p)
15040{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015041 PyUnicode_InternInPlace(p);
15042 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015043 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015044 Py_INCREF(*p);
15045 }
Walter Dörwald16807132007-05-25 13:52:07 +000015046}
15047
15048PyObject *
15049PyUnicode_InternFromString(const char *cp)
15050{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015051 PyObject *s = PyUnicode_FromString(cp);
15052 if (s == NULL)
15053 return NULL;
15054 PyUnicode_InternInPlace(&s);
15055 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015056}
15057
Alexander Belopolsky40018472011-02-26 01:02:56 +000015058void
15059_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015060{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015061 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015062 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015063 Py_ssize_t i, n;
15064 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015065
Benjamin Peterson14339b62009-01-31 16:36:08 +000015066 if (interned == NULL || !PyDict_Check(interned))
15067 return;
15068 keys = PyDict_Keys(interned);
15069 if (keys == NULL || !PyList_Check(keys)) {
15070 PyErr_Clear();
15071 return;
15072 }
Walter Dörwald16807132007-05-25 13:52:07 +000015073
Benjamin Peterson14339b62009-01-31 16:36:08 +000015074 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15075 detector, interned unicode strings are not forcibly deallocated;
15076 rather, we give them their stolen references back, and then clear
15077 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015078
Benjamin Peterson14339b62009-01-31 16:36:08 +000015079 n = PyList_GET_SIZE(keys);
15080 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015081 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015082 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015083 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015084 if (PyUnicode_READY(s) == -1) {
15085 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015086 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015087 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015088 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015089 case SSTATE_NOT_INTERNED:
15090 /* XXX Shouldn't happen */
15091 break;
15092 case SSTATE_INTERNED_IMMORTAL:
15093 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015094 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015095 break;
15096 case SSTATE_INTERNED_MORTAL:
15097 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015098 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015099 break;
15100 default:
15101 Py_FatalError("Inconsistent interned string state.");
15102 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015103 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015104 }
15105 fprintf(stderr, "total size of all interned strings: "
15106 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15107 "mortal/immortal\n", mortal_size, immortal_size);
15108 Py_DECREF(keys);
15109 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015110 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015111}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015112
15113
15114/********************* Unicode Iterator **************************/
15115
15116typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015117 PyObject_HEAD
15118 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015119 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015120} unicodeiterobject;
15121
15122static void
15123unicodeiter_dealloc(unicodeiterobject *it)
15124{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015125 _PyObject_GC_UNTRACK(it);
15126 Py_XDECREF(it->it_seq);
15127 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015128}
15129
15130static int
15131unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15132{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015133 Py_VISIT(it->it_seq);
15134 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015135}
15136
15137static PyObject *
15138unicodeiter_next(unicodeiterobject *it)
15139{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015140 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015141
Benjamin Peterson14339b62009-01-31 16:36:08 +000015142 assert(it != NULL);
15143 seq = it->it_seq;
15144 if (seq == NULL)
15145 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015146 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015148 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15149 int kind = PyUnicode_KIND(seq);
15150 void *data = PyUnicode_DATA(seq);
15151 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15152 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015153 if (item != NULL)
15154 ++it->it_index;
15155 return item;
15156 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015157
Benjamin Peterson14339b62009-01-31 16:36:08 +000015158 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015159 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015160 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015161}
15162
15163static PyObject *
15164unicodeiter_len(unicodeiterobject *it)
15165{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015166 Py_ssize_t len = 0;
15167 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015168 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015169 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015170}
15171
15172PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15173
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015174static PyObject *
15175unicodeiter_reduce(unicodeiterobject *it)
15176{
15177 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015178 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015179 it->it_seq, it->it_index);
15180 } else {
15181 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15182 if (u == NULL)
15183 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015184 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015185 }
15186}
15187
15188PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15189
15190static PyObject *
15191unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15192{
15193 Py_ssize_t index = PyLong_AsSsize_t(state);
15194 if (index == -1 && PyErr_Occurred())
15195 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015196 if (it->it_seq != NULL) {
15197 if (index < 0)
15198 index = 0;
15199 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15200 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15201 it->it_index = index;
15202 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015203 Py_RETURN_NONE;
15204}
15205
15206PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15207
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015208static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015209 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015210 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015211 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15212 reduce_doc},
15213 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15214 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015215 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015216};
15217
15218PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015219 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15220 "str_iterator", /* tp_name */
15221 sizeof(unicodeiterobject), /* tp_basicsize */
15222 0, /* tp_itemsize */
15223 /* methods */
15224 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15225 0, /* tp_print */
15226 0, /* tp_getattr */
15227 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015228 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015229 0, /* tp_repr */
15230 0, /* tp_as_number */
15231 0, /* tp_as_sequence */
15232 0, /* tp_as_mapping */
15233 0, /* tp_hash */
15234 0, /* tp_call */
15235 0, /* tp_str */
15236 PyObject_GenericGetAttr, /* tp_getattro */
15237 0, /* tp_setattro */
15238 0, /* tp_as_buffer */
15239 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15240 0, /* tp_doc */
15241 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15242 0, /* tp_clear */
15243 0, /* tp_richcompare */
15244 0, /* tp_weaklistoffset */
15245 PyObject_SelfIter, /* tp_iter */
15246 (iternextfunc)unicodeiter_next, /* tp_iternext */
15247 unicodeiter_methods, /* tp_methods */
15248 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015249};
15250
15251static PyObject *
15252unicode_iter(PyObject *seq)
15253{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015254 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015255
Benjamin Peterson14339b62009-01-31 16:36:08 +000015256 if (!PyUnicode_Check(seq)) {
15257 PyErr_BadInternalCall();
15258 return NULL;
15259 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015260 if (PyUnicode_READY(seq) == -1)
15261 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015262 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15263 if (it == NULL)
15264 return NULL;
15265 it->it_index = 0;
15266 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015267 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015268 _PyObject_GC_TRACK(it);
15269 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015270}
15271
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015272
/* Return the number of Py_UNICODE elements in the null-terminated
   buffer u, not counting the terminating null element.

   The counter has the same type as the return value (size_t) so that
   buffers longer than INT_MAX elements are counted correctly instead of
   overflowing a signed int. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;
    while (*u++)
        res++;
    return res;
}
15281
/* Copy the null-terminated buffer s2, including its terminator, into s1
   and return s1.  The caller must provide a large enough destination. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;
    Py_UNICODE ch;

    for (;;) {
        ch = *s2++;
        *dst++ = ch;
        if (ch == 0) {
            /* The terminator has been copied as well. */
            break;
        }
    }
    return s1;
}
15289
/* Copy at most n elements of the null-terminated buffer s2 into s1 and
   return s1.

   Copying stops after the terminating null element of s2 has been
   copied, or after n elements have been written, whichever comes first.
   Consequently s1 is NOT null-terminated when s2 holds n or more
   characters, and nothing at all is written when n is 0.  Unlike C's
   strncpy(), the remainder of s1 is not padded with null elements.

   The element count is checked before every store so that never more
   than n elements of s1 are written. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        s1[i] = s2[i];
        if (s2[i] == 0)
            break;
    }
    return s1;
}
15299
/* Append the null-terminated buffer s2 to the null-terminated buffer s1
   and return s1.  The caller must provide enough room in s1. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* The copy starts on top of the current terminator of s1. */
    Py_UNICODE *tail = s1 + Py_UNICODE_strlen(s1);

    Py_UNICODE_strcpy(tail, s2);
    return s1;
}
15308
/* Compare two null-terminated Py_UNICODE buffers element by element.

   Returns 0 if they are equal, -1 if s1 orders before s2 and +1 if s1
   orders after s2.  A buffer that is a proper prefix of the other one
   orders first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE a, b;

    /* Walk both buffers until they differ or s1 ends. */
    do {
        a = *s1++;
        b = *s2++;
    } while (a != 0 && a == b);

    if (a == b) {
        /* Both buffers ended at the same position. */
        return 0;
    }
    if (a == 0) {
        /* s1 is a proper prefix of s2. */
        return -1;
    }
    if (b == 0) {
        /* s2 is a proper prefix of s1. */
        return 1;
    }
    return (a < b) ? -1 : +1;
}
15322
/* Compare at most the first n elements of two null-terminated
   Py_UNICODE buffers.

   Returns 0 if the compared parts are equal (always the case for
   n == 0), -1 if s1 orders before s2 and +1 if s1 orders after s2.
   The comparison stops at a terminating null element common to both. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        Py_UNICODE a = s1[i];
        Py_UNICODE b = s2[i];

        if (a != b) {
            return (a < b) ? -1 : +1;
        }
        if (a == '\0') {
            /* Both buffers ended here; nothing left to compare. */
            break;
        }
    }
    return 0;
}
15339
/* Return a pointer to the first occurrence of c in the null-terminated
   buffer s, or NULL if c does not occur.  The terminating null element
   is not searched, so looking for 0 always yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c) {
            return (Py_UNICODE*)s;
        }
        s++;
    }
    return NULL;
}
15349
/* Return a pointer to the last occurrence of c in the null-terminated
   buffer s, or NULL if c does not occur.  The terminating null element
   is not searched, so looking for 0 always yields NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *last = NULL;
    const Py_UNICODE *cur;

    /* Single forward pass that remembers the most recent match. */
    for (cur = s; *cur != 0; cur++) {
        if (*cur == c) {
            last = cur;
        }
    }
    return (Py_UNICODE*)last;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015362
Victor Stinner71133ff2010-09-01 23:43:53 +000015363Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015364PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015365{
Victor Stinner577db2c2011-10-11 22:12:48 +020015366 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015367 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015369 if (!PyUnicode_Check(unicode)) {
15370 PyErr_BadArgument();
15371 return NULL;
15372 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015373 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015374 if (u == NULL)
15375 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015376 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015377 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015378 PyErr_NoMemory();
15379 return NULL;
15380 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015381 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015382 size *= sizeof(Py_UNICODE);
15383 copy = PyMem_Malloc(size);
15384 if (copy == NULL) {
15385 PyErr_NoMemory();
15386 return NULL;
15387 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015388 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015389 return copy;
15390}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015391
Georg Brandl66c221e2010-10-14 07:04:07 +000015392/* A _string module, to export formatter_parser and formatter_field_name_split
15393 to the string.Formatter class implemented in Python. */
15394
15395static PyMethodDef _string_methods[] = {
15396 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15397 METH_O, PyDoc_STR("split the argument as a field name")},
15398 {"formatter_parser", (PyCFunction) formatter_parser,
15399 METH_O, PyDoc_STR("parse the argument as a format string")},
15400 {NULL, NULL}
15401};
15402
15403static struct PyModuleDef _string_module = {
15404 PyModuleDef_HEAD_INIT,
15405 "_string",
15406 PyDoc_STR("string helper module"),
15407 0,
15408 _string_methods,
15409 NULL,
15410 NULL,
15411 NULL,
15412 NULL
15413};
15414
15415PyMODINIT_FUNC
15416PyInit__string(void)
15417{
15418 return PyModule_Create(&_string_module);
15419}
15420
15421
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015422#ifdef __cplusplus
15423}
15424#endif