blob: b146da952db9e98f64213eea6192b85617d6d319 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080051class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080053/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055/* --- Globals ------------------------------------------------------------
56
Serhiy Storchaka05997252013-01-26 12:14:02 +020057NOTE: In the interpreter's initialization phase, some globals are currently
58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000060
61*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
Victor Stinner8faf8212011-12-08 22:14:11 +010068/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
69#define MAX_UNICODE 0x10ffff
70
/* Sanity check used inside the assert()s of the accessor macros below.
   In Py_DEBUG builds it runs _PyUnicode_CheckConsistency() with
   check_content=0; otherwise it is a plain PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Naming convention visible in this group: the "_PyUnicode_XXX" variants
   read the struct field directly (usable as lvalues, no checks), while the
   "PyUnicode_XXX" variants assert on the object first and special-case
   compact ASCII strings. */

/* Raw access to the utf8 pointer field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer: for a compact ASCII string this is the memory
   immediately following the PyASCIIObject header, otherwise the utf8
   field.  Asserts that op is a ready unicode object. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length: for a compact ASCII string this is the length
   field of the PyASCIIObject, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw access to the wstr pointer (PyASCIIObject) and its length
   (PyCompactUnicodeObject). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw access to the length, state and hash fields of the PyASCIIObject
   header. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind and length, guarded only by _PyUnicode_CHECK (no readiness
   assertion, unlike the PyUnicode_UTF8* macros above). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200111
/* File-local replacement for PyUnicode_READY: asserts with _PyUnicode_CHECK
   (the consistency check in Py_DEBUG builds), then evaluates to 0 if op is
   already ready and to the result of _PyUnicode_Ready(op) otherwise.
   op is evaluated more than once. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))

/* True if the utf8 pointer of op is the same address as its character
   data, i.e. the two representations share one buffer.  Asserts that op is
   not a compact ASCII string. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the same address as its character
   data, i.e. the wchar_t representation shares the data buffer.
   Every reference in the expansion goes through the macro parameter `op`;
   op is evaluated more than once, so pass a side-effect-free expression. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its utf8
   pointer is non-NULL, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either
   the object is not ready yet or wstr differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
140
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element is converted with a plain (to_type) cast; no range
   checking is done.  The bulk of the input is copied four elements per
   iteration, the remaining 0-3 elements one at a time.

   All locals carry a leading underscore so they cannot shadow caller
   identifiers, and the source pointers are cast straight to
   "const from_type *" so a const-qualified input is never stripped of its
   qualifier.  begin, end and to are each evaluated exactly once. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t _len = (_end) - (_iter);             \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_len, 4);       \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  If that creation fails, unicode_empty
   stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL) {                    \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
        else {                                          \
            Py_INCREF(unicode_empty);                   \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function; the returned value is NULL if the empty string could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0..127; an entry is 1 for:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 */ 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 */ 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38 */ 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78 */ 0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Fast detection of line-break characters in the ASCII range.
   Indexed by code point 0..127; an entry is 1 for:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
   and 0 for everything else.  The table is read-only lookup data, hence
   const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 */ 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 */ 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38 */ 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78 */ 0, 0, 0, 0, 0, 0, 0, 0
};
292
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300293#include "clinic/unicodeobject.c.h"
294
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300295/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
296 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000298PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000300#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000303 /* This is actually an illegal character, so it should
304 not be passed to unichr. */
305 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000306#endif
307}
308
#ifdef Py_DEBUG
/* Debug-only structural validation of a unicode object.

   op            -- object to check; must satisfy PyUnicode_Check().
   check_content -- if non-zero (and the string is not in the legacy
                    WCHAR kind), additionally scan every character to
                    verify that the narrowest possible kind is in use and
                    that the data is NUL-terminated.

   Every violated invariant trips an assert(); the function itself always
   returns 1, which lets callers write
   assert(_PyUnicode_CheckConsistency(...)). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Case 1: compact ASCII -- only the kind and ready flag are checked. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        /* Case 2: compact, non-ASCII -- the character data starts right
           after the PyCompactUnicodeObject header and must not be aliased
           by the utf8 pointer. */
        if (ascii->state.compact == 1) {
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            /* Case 3: non-compact, not ready -- only the wstr buffer
               exists; length, hash, flags, data and utf8 are all still in
               their "unset" state. */
            if (kind == PyUnicode_WCHAR_KIND) {
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            /* Case 4: non-compact, ready -- data is a separate non-NULL
               buffer; for ASCII content the utf8 pointer must alias it
               (with matching length), otherwise it must not. */
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        /* wstr must alias the data exactly when the kind has the same
           width as wchar_t (2-byte kind if SIZEOF_WCHAR_T == 2, else the
           4-byte kind), and must not alias it otherwise. */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* A missing cached representation must have a zero length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* Find the largest code point actually stored. */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* maxchar must fall in the range that justifies this kind:
           < 128 for ASCII, 128..255 for non-ASCII 1-byte,
           0x100..0xFFFF for 2-byte, 0x10000..MAX_UNICODE for 4-byte. */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* The element one past the end must be the NUL terminator. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
424
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100425static PyObject*
426unicode_result_wchar(PyObject *unicode)
427{
428#ifndef Py_DEBUG
429 Py_ssize_t len;
430
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 len = _PyUnicode_WSTR_LENGTH(unicode);
432 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200434 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100435 }
436
437 if (len == 1) {
438 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100439 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100440 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
441 Py_DECREF(unicode);
442 return latin1_char;
443 }
444 }
445
446 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200447 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100448 return NULL;
449 }
450#else
Victor Stinneraa771272012-10-04 02:32:58 +0200451 assert(Py_REFCNT(unicode) == 1);
452
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100453 /* don't make the result ready in debug mode to ensure that the caller
454 makes the string ready before using it */
455 assert(_PyUnicode_CheckConsistency(unicode, 1));
456#endif
457 return unicode;
458}
459
460static PyObject*
461unicode_result_ready(PyObject *unicode)
462{
463 Py_ssize_t length;
464
465 length = PyUnicode_GET_LENGTH(unicode);
466 if (length == 0) {
467 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200469 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100470 }
471 return unicode_empty;
472 }
473
474 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200475 void *data = PyUnicode_DATA(unicode);
476 int kind = PyUnicode_KIND(unicode);
477 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100478 if (ch < 256) {
479 PyObject *latin1_char = unicode_latin1[ch];
480 if (latin1_char != NULL) {
481 if (unicode != latin1_char) {
482 Py_INCREF(latin1_char);
483 Py_DECREF(unicode);
484 }
485 return latin1_char;
486 }
487 else {
488 assert(_PyUnicode_CheckConsistency(unicode, 1));
489 Py_INCREF(unicode);
490 unicode_latin1[ch] = unicode;
491 return unicode;
492 }
493 }
494 }
495
496 assert(_PyUnicode_CheckConsistency(unicode, 1));
497 return unicode;
498}
499
500static PyObject*
501unicode_result(PyObject *unicode)
502{
503 assert(_PyUnicode_CHECK(unicode));
504 if (PyUnicode_IS_READY(unicode))
505 return unicode_result_ready(unicode);
506 else
507 return unicode_result_wchar(unicode);
508}
509
Victor Stinnerc4b49542011-12-11 22:44:26 +0100510static PyObject*
511unicode_result_unchanged(PyObject *unicode)
512{
513 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500514 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100515 return NULL;
516 Py_INCREF(unicode);
517 return unicode;
518 }
519 else
520 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100521 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100522}
523
Thomas Wouters477c8d52006-05-27 19:21:47 +0000524/* --- Bloom Filters ----------------------------------------------------- */
525
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */
529
530/* the linebreak mask is set up by Unicode_Init below */
531
/* BLOOM_WIDTH is the number of bits used in a bloom mask: the largest of
   128, 64 or 32 that does not exceed LONG_BIT.  Anything narrower than
   32 bits is rejected at compile time. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type that holds a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask for line-break characters.  It starts with every bit set, so
   until it is replaced with a real mask BLOOM() reports "maybe" for every
   character. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000545
/* Non-zero if the bit selected by the low-order bits of ch is set in mask,
   i.e. ch *may* belong to the set the mask was built from; zero means it
   definitely does not.  Both arguments are fully parenthesized so that any
   expression (e.g. "a | b") can be passed as mask. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
/* True if ch is a line-break character.  Code points below 128 are looked
   up directly in ascii_linebreak[]; for larger ones the bloom_linebreak
   mask is consulted first and Py_UNICODE_ISLINEBREAK() is only called when
   the mask says "maybe".  ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
Alexander Belopolsky40018472011-02-26 01:02:56 +0000552Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554{
Victor Stinnera85af502013-04-09 21:53:54 +0200555#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
556 do { \
557 TYPE *data = (TYPE *)PTR; \
558 TYPE *end = data + LEN; \
559 Py_UCS4 ch; \
560 for (; data != end; data++) { \
561 ch = *data; \
562 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
563 } \
564 break; \
565 } while (0)
566
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567 /* calculate simple bloom-style bitmask for a given unicode string */
568
Antoine Pitrouf068f942010-01-13 14:19:12 +0000569 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000570
571 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200572 switch (kind) {
573 case PyUnicode_1BYTE_KIND:
574 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
575 break;
576 case PyUnicode_2BYTE_KIND:
577 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
578 break;
579 case PyUnicode_4BYTE_KIND:
580 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
581 break;
582 default:
583 assert(0);
584 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000585 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200586
587#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000588}
589
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200590/* Compilation of templated routines */
591
592#include "stringlib/asciilib.h"
593#include "stringlib/fastsearch.h"
594#include "stringlib/partition.h"
595#include "stringlib/split.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
598#include "stringlib/find_max_char.h"
599#include "stringlib/localeutil.h"
600#include "stringlib/undef.h"
601
602#include "stringlib/ucs1lib.h"
603#include "stringlib/fastsearch.h"
604#include "stringlib/partition.h"
605#include "stringlib/split.h"
606#include "stringlib/count.h"
607#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300608#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200609#include "stringlib/find_max_char.h"
610#include "stringlib/localeutil.h"
611#include "stringlib/undef.h"
612
613#include "stringlib/ucs2lib.h"
614#include "stringlib/fastsearch.h"
615#include "stringlib/partition.h"
616#include "stringlib/split.h"
617#include "stringlib/count.h"
618#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300619#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200620#include "stringlib/find_max_char.h"
621#include "stringlib/localeutil.h"
622#include "stringlib/undef.h"
623
624#include "stringlib/ucs4lib.h"
625#include "stringlib/fastsearch.h"
626#include "stringlib/partition.h"
627#include "stringlib/split.h"
628#include "stringlib/count.h"
629#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300630#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200631#include "stringlib/find_max_char.h"
632#include "stringlib/localeutil.h"
633#include "stringlib/undef.h"
634
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200635#include "stringlib/unicodedefs.h"
636#include "stringlib/fastsearch.h"
637#include "stringlib/count.h"
638#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100639#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200640
Guido van Rossumd57fd912000-03-10 22:53:23 +0000641/* --- Unicode Object ----------------------------------------------------- */
642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200643static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200644fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200646Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200647 Py_ssize_t size, Py_UCS4 ch,
648 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200650 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
651
652 switch (kind) {
653 case PyUnicode_1BYTE_KIND:
654 {
655 Py_UCS1 ch1 = (Py_UCS1) ch;
656 if (ch1 == ch)
657 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
658 else
659 return -1;
660 }
661 case PyUnicode_2BYTE_KIND:
662 {
663 Py_UCS2 ch2 = (Py_UCS2) ch;
664 if (ch2 == ch)
665 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
666 else
667 return -1;
668 }
669 case PyUnicode_4BYTE_KIND:
670 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
671 default:
672 assert(0);
673 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200675}
676
Victor Stinnerafffce42012-10-03 23:03:17 +0200677#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000678/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200679 earlier.
680
681 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
682 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
683 invalid character in Unicode 6.0. */
684static void
685unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
686{
687 int kind = PyUnicode_KIND(unicode);
688 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
689 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
690 if (length <= old_length)
691 return;
692 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
693}
694#endif
695
Victor Stinnerfe226c02011-10-03 03:52:20 +0200696static PyObject*
697resize_compact(PyObject *unicode, Py_ssize_t length)
698{
699 Py_ssize_t char_size;
700 Py_ssize_t struct_size;
701 Py_ssize_t new_size;
702 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100703 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200704#ifdef Py_DEBUG
705 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
706#endif
707
Victor Stinner79891572012-05-03 13:43:07 +0200708 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100710 assert(PyUnicode_IS_COMPACT(unicode));
711
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200712 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100713 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200714 struct_size = sizeof(PyASCIIObject);
715 else
716 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200717 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
720 PyErr_NoMemory();
721 return NULL;
722 }
723 new_size = (struct_size + (length + 1) * char_size);
724
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200725 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
726 PyObject_DEL(_PyUnicode_UTF8(unicode));
727 _PyUnicode_UTF8(unicode) = NULL;
728 _PyUnicode_UTF8_LENGTH(unicode) = 0;
729 }
Victor Stinner84def372011-12-11 20:04:56 +0100730 _Py_DEC_REFTOTAL;
731 _Py_ForgetReference(unicode);
732
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300733 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100734 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100735 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 PyErr_NoMemory();
737 return NULL;
738 }
Victor Stinner84def372011-12-11 20:04:56 +0100739 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200740 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100741
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200743 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200744 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100745 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200746 _PyUnicode_WSTR_LENGTH(unicode) = length;
747 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100748 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
749 PyObject_DEL(_PyUnicode_WSTR(unicode));
750 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +0100751 if (!PyUnicode_IS_ASCII(unicode))
752 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100753 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200754#ifdef Py_DEBUG
755 unicode_fill_invalid(unicode, old_length);
756#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200757 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
758 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200759 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760 return unicode;
761}
762
Alexander Belopolsky40018472011-02-26 01:02:56 +0000763static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200764resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000765{
Victor Stinner95663112011-10-04 01:03:50 +0200766 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100767 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200768 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000770
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771 if (PyUnicode_IS_READY(unicode)) {
772 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200773 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200775#ifdef Py_DEBUG
776 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
777#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200780 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200781 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
782 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200783
784 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
785 PyErr_NoMemory();
786 return -1;
787 }
788 new_size = (length + 1) * char_size;
789
Victor Stinner7a9105a2011-12-12 00:13:42 +0100790 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
791 {
792 PyObject_DEL(_PyUnicode_UTF8(unicode));
793 _PyUnicode_UTF8(unicode) = NULL;
794 _PyUnicode_UTF8_LENGTH(unicode) = 0;
795 }
796
Victor Stinnerfe226c02011-10-03 03:52:20 +0200797 data = (PyObject *)PyObject_REALLOC(data, new_size);
798 if (data == NULL) {
799 PyErr_NoMemory();
800 return -1;
801 }
802 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200803 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200804 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200805 _PyUnicode_WSTR_LENGTH(unicode) = length;
806 }
807 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200808 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200809 _PyUnicode_UTF8_LENGTH(unicode) = length;
810 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200811 _PyUnicode_LENGTH(unicode) = length;
812 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200813#ifdef Py_DEBUG
814 unicode_fill_invalid(unicode, old_length);
815#endif
Victor Stinner95663112011-10-04 01:03:50 +0200816 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200817 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200818 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200819 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200820 }
Victor Stinner95663112011-10-04 01:03:50 +0200821 assert(_PyUnicode_WSTR(unicode) != NULL);
822
823 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700824 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +0200825 PyErr_NoMemory();
826 return -1;
827 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100828 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200829 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100830 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200831 if (!wstr) {
832 PyErr_NoMemory();
833 return -1;
834 }
835 _PyUnicode_WSTR(unicode) = wstr;
836 _PyUnicode_WSTR(unicode)[length] = 0;
837 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200838 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000839 return 0;
840}
841
Victor Stinnerfe226c02011-10-03 03:52:20 +0200842static PyObject*
843resize_copy(PyObject *unicode, Py_ssize_t length)
844{
845 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100846 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200847 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100848
Benjamin Petersonbac79492012-01-14 13:34:47 -0500849 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100850 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200851
852 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
853 if (copy == NULL)
854 return NULL;
855
856 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200857 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200858 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200859 }
860 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200861 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100862
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200863 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200864 if (w == NULL)
865 return NULL;
866 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
867 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200868 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
869 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200870 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200871 }
872}
873
Guido van Rossumd57fd912000-03-10 22:53:23 +0000874/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000875 Ux0000 terminated; some code (e.g. new_identifier)
876 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000877
878 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000879 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880
881*/
882
Alexander Belopolsky40018472011-02-26 01:02:56 +0000883static PyUnicodeObject *
884_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200886 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200887 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000888
Thomas Wouters477c8d52006-05-27 19:21:47 +0000889 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 if (length == 0 && unicode_empty != NULL) {
891 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200892 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 }
894
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000895 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700896 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000897 return (PyUnicodeObject *)PyErr_NoMemory();
898 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200899 if (length < 0) {
900 PyErr_SetString(PyExc_SystemError,
901 "Negative size passed to _PyUnicode_New");
902 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000903 }
904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
906 if (unicode == NULL)
907 return NULL;
908 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100909
910 _PyUnicode_WSTR_LENGTH(unicode) = length;
911 _PyUnicode_HASH(unicode) = -1;
912 _PyUnicode_STATE(unicode).interned = 0;
913 _PyUnicode_STATE(unicode).kind = 0;
914 _PyUnicode_STATE(unicode).compact = 0;
915 _PyUnicode_STATE(unicode).ready = 0;
916 _PyUnicode_STATE(unicode).ascii = 0;
917 _PyUnicode_DATA_ANY(unicode) = NULL;
918 _PyUnicode_LENGTH(unicode) = 0;
919 _PyUnicode_UTF8(unicode) = NULL;
920 _PyUnicode_UTF8_LENGTH(unicode) = 0;
921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200922 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
923 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100924 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000925 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100926 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000927 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200928
Jeremy Hyltond8082792003-09-16 19:41:39 +0000929 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000930 * the caller fails before initializing str -- unicode_resize()
931 * reads str[0], and the Keep-Alive optimization can keep memory
932 * allocated for str alive across a call to unicode_dealloc(unicode).
933 * We don't want unicode_resize to read uninitialized memory in
934 * that case.
935 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200936 _PyUnicode_WSTR(unicode)[0] = 0;
937 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100938
Victor Stinner7931d9a2011-11-04 00:22:48 +0100939 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000940 return unicode;
941}
942
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943static const char*
944unicode_kind_name(PyObject *unicode)
945{
Victor Stinner42dfd712011-10-03 14:41:45 +0200946 /* don't check consistency: unicode_kind_name() is called from
947 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200948 if (!PyUnicode_IS_COMPACT(unicode))
949 {
950 if (!PyUnicode_IS_READY(unicode))
951 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600952 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200953 {
954 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200955 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200956 return "legacy ascii";
957 else
958 return "legacy latin1";
959 case PyUnicode_2BYTE_KIND:
960 return "legacy UCS2";
961 case PyUnicode_4BYTE_KIND:
962 return "legacy UCS4";
963 default:
964 return "<legacy invalid kind>";
965 }
966 }
967 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600968 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 return "ascii";
972 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200973 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200974 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200975 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200976 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200977 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200978 default:
979 return "<invalid compact kind>";
980 }
981}
982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983#ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
988
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
992void *_PyUnicode_data(void *unicode){
993 printf("obj %p\n", unicode);
994 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
995 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
996 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
997 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
998 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
999 return PyUnicode_DATA(unicode);
1000}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001001
1002void
1003_PyUnicode_Dump(PyObject *op)
1004{
1005 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001006 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1007 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1008 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001009
Victor Stinnera849a4b2011-10-03 12:12:11 +02001010 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001011 {
1012 if (ascii->state.ascii)
1013 data = (ascii + 1);
1014 else
1015 data = (compact + 1);
1016 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001017 else
1018 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001019 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1020 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001021
Victor Stinnera849a4b2011-10-03 12:12:11 +02001022 if (ascii->wstr == data)
1023 printf("shared ");
1024 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001025
Victor Stinnera3b334d2011-10-03 13:53:37 +02001026 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001027 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001028 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1029 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001030 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1031 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001032 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001033 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001034}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001035#endif
1036
1037PyObject *
1038PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1039{
1040 PyObject *obj;
1041 PyCompactUnicodeObject *unicode;
1042 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001043 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001044 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 Py_ssize_t char_size;
1046 Py_ssize_t struct_size;
1047
1048 /* Optimization for empty strings */
1049 if (size == 0 && unicode_empty != NULL) {
1050 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001051 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 }
1053
Victor Stinner9e9d6892011-10-04 01:02:02 +02001054 is_ascii = 0;
1055 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056 struct_size = sizeof(PyCompactUnicodeObject);
1057 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001058 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059 char_size = 1;
1060 is_ascii = 1;
1061 struct_size = sizeof(PyASCIIObject);
1062 }
1063 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001064 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 char_size = 1;
1066 }
1067 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001068 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 char_size = 2;
1070 if (sizeof(wchar_t) == 2)
1071 is_sharing = 1;
1072 }
1073 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001074 if (maxchar > MAX_UNICODE) {
1075 PyErr_SetString(PyExc_SystemError,
1076 "invalid maximum character passed to PyUnicode_New");
1077 return NULL;
1078 }
Victor Stinner8f825062012-04-27 13:55:39 +02001079 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001080 char_size = 4;
1081 if (sizeof(wchar_t) == 4)
1082 is_sharing = 1;
1083 }
1084
1085 /* Ensure we won't overflow the size. */
1086 if (size < 0) {
1087 PyErr_SetString(PyExc_SystemError,
1088 "Negative size passed to PyUnicode_New");
1089 return NULL;
1090 }
1091 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1092 return PyErr_NoMemory();
1093
1094 /* Duplicated allocation code from _PyObject_New() instead of a call to
1095 * PyObject_New() so we are able to allocate space for the object and
1096 * it's data buffer.
1097 */
1098 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1099 if (obj == NULL)
1100 return PyErr_NoMemory();
1101 obj = PyObject_INIT(obj, &PyUnicode_Type);
1102 if (obj == NULL)
1103 return NULL;
1104
1105 unicode = (PyCompactUnicodeObject *)obj;
1106 if (is_ascii)
1107 data = ((PyASCIIObject*)obj) + 1;
1108 else
1109 data = unicode + 1;
1110 _PyUnicode_LENGTH(unicode) = size;
1111 _PyUnicode_HASH(unicode) = -1;
1112 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001113 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001114 _PyUnicode_STATE(unicode).compact = 1;
1115 _PyUnicode_STATE(unicode).ready = 1;
1116 _PyUnicode_STATE(unicode).ascii = is_ascii;
1117 if (is_ascii) {
1118 ((char*)data)[size] = 0;
1119 _PyUnicode_WSTR(unicode) = NULL;
1120 }
Victor Stinner8f825062012-04-27 13:55:39 +02001121 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001122 ((char*)data)[size] = 0;
1123 _PyUnicode_WSTR(unicode) = NULL;
1124 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001126 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001127 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128 else {
1129 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001130 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001131 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001132 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001133 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134 ((Py_UCS4*)data)[size] = 0;
1135 if (is_sharing) {
1136 _PyUnicode_WSTR_LENGTH(unicode) = size;
1137 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1138 }
1139 else {
1140 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1141 _PyUnicode_WSTR(unicode) = NULL;
1142 }
1143 }
Victor Stinner8f825062012-04-27 13:55:39 +02001144#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001145 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001146#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001147 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148 return obj;
1149}
1150
1151#if SIZEOF_WCHAR_T == 2
1152/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1153 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001154 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001155
1156 This function assumes that unicode can hold one more code point than wstr
1157 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001158static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001160 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001161{
1162 const wchar_t *iter;
1163 Py_UCS4 *ucs4_out;
1164
Victor Stinner910337b2011-10-03 03:20:16 +02001165 assert(unicode != NULL);
1166 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1168 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1169
1170 for (iter = begin; iter < end; ) {
1171 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1172 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001173 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1174 && (iter+1) < end
1175 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176 {
Victor Stinner551ac952011-11-29 22:58:13 +01001177 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001178 iter += 2;
1179 }
1180 else {
1181 *ucs4_out++ = *iter;
1182 iter++;
1183 }
1184 }
1185 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1186 _PyUnicode_GET_LENGTH(unicode)));
1187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001188}
1189#endif
1190
Victor Stinnercd9950f2011-10-02 00:34:53 +02001191static int
Victor Stinner488fa492011-12-12 00:01:39 +01001192unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001193{
Victor Stinner488fa492011-12-12 00:01:39 +01001194 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001195 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001196 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001197 return -1;
1198 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001199 return 0;
1200}
1201
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001202static int
1203_copy_characters(PyObject *to, Py_ssize_t to_start,
1204 PyObject *from, Py_ssize_t from_start,
1205 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001206{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001207 unsigned int from_kind, to_kind;
1208 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209
Victor Stinneree4544c2012-05-09 22:24:08 +02001210 assert(0 <= how_many);
1211 assert(0 <= from_start);
1212 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001213 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001214 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001215 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216
Victor Stinnerd3f08822012-05-29 12:57:52 +02001217 assert(PyUnicode_Check(to));
1218 assert(PyUnicode_IS_READY(to));
1219 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1220
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001221 if (how_many == 0)
1222 return 0;
1223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001225 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001226 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001227 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228
Victor Stinnerf1852262012-06-16 16:38:26 +02001229#ifdef Py_DEBUG
1230 if (!check_maxchar
1231 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1232 {
1233 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1234 Py_UCS4 ch;
1235 Py_ssize_t i;
1236 for (i=0; i < how_many; i++) {
1237 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1238 assert(ch <= to_maxchar);
1239 }
1240 }
1241#endif
1242
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001243 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001244 if (check_maxchar
1245 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1246 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001247 /* Writing Latin-1 characters into an ASCII string requires to
1248 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001249 Py_UCS4 max_char;
1250 max_char = ucs1lib_find_max_char(from_data,
1251 (Py_UCS1*)from_data + how_many);
1252 if (max_char >= 128)
1253 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001254 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001255 Py_MEMCPY((char*)to_data + to_kind * to_start,
1256 (char*)from_data + from_kind * from_start,
1257 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001258 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001259 else if (from_kind == PyUnicode_1BYTE_KIND
1260 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001261 {
1262 _PyUnicode_CONVERT_BYTES(
1263 Py_UCS1, Py_UCS2,
1264 PyUnicode_1BYTE_DATA(from) + from_start,
1265 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1266 PyUnicode_2BYTE_DATA(to) + to_start
1267 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001268 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001269 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001270 && to_kind == PyUnicode_4BYTE_KIND)
1271 {
1272 _PyUnicode_CONVERT_BYTES(
1273 Py_UCS1, Py_UCS4,
1274 PyUnicode_1BYTE_DATA(from) + from_start,
1275 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1276 PyUnicode_4BYTE_DATA(to) + to_start
1277 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001278 }
1279 else if (from_kind == PyUnicode_2BYTE_KIND
1280 && to_kind == PyUnicode_4BYTE_KIND)
1281 {
1282 _PyUnicode_CONVERT_BYTES(
1283 Py_UCS2, Py_UCS4,
1284 PyUnicode_2BYTE_DATA(from) + from_start,
1285 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1286 PyUnicode_4BYTE_DATA(to) + to_start
1287 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001288 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001289 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001290 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1291
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001292 if (!check_maxchar) {
1293 if (from_kind == PyUnicode_2BYTE_KIND
1294 && to_kind == PyUnicode_1BYTE_KIND)
1295 {
1296 _PyUnicode_CONVERT_BYTES(
1297 Py_UCS2, Py_UCS1,
1298 PyUnicode_2BYTE_DATA(from) + from_start,
1299 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1300 PyUnicode_1BYTE_DATA(to) + to_start
1301 );
1302 }
1303 else if (from_kind == PyUnicode_4BYTE_KIND
1304 && to_kind == PyUnicode_1BYTE_KIND)
1305 {
1306 _PyUnicode_CONVERT_BYTES(
1307 Py_UCS4, Py_UCS1,
1308 PyUnicode_4BYTE_DATA(from) + from_start,
1309 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1310 PyUnicode_1BYTE_DATA(to) + to_start
1311 );
1312 }
1313 else if (from_kind == PyUnicode_4BYTE_KIND
1314 && to_kind == PyUnicode_2BYTE_KIND)
1315 {
1316 _PyUnicode_CONVERT_BYTES(
1317 Py_UCS4, Py_UCS2,
1318 PyUnicode_4BYTE_DATA(from) + from_start,
1319 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1320 PyUnicode_2BYTE_DATA(to) + to_start
1321 );
1322 }
1323 else {
1324 assert(0);
1325 return -1;
1326 }
1327 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001328 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001329 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001330 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001331 Py_ssize_t i;
1332
Victor Stinnera0702ab2011-09-29 14:14:38 +02001333 for (i=0; i < how_many; i++) {
1334 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001335 if (ch > to_maxchar)
1336 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001337 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1338 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001339 }
1340 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001341 return 0;
1342}
1343
Victor Stinnerd3f08822012-05-29 12:57:52 +02001344void
1345_PyUnicode_FastCopyCharacters(
1346 PyObject *to, Py_ssize_t to_start,
1347 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001348{
1349 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1350}
1351
1352Py_ssize_t
1353PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1354 PyObject *from, Py_ssize_t from_start,
1355 Py_ssize_t how_many)
1356{
1357 int err;
1358
1359 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1360 PyErr_BadInternalCall();
1361 return -1;
1362 }
1363
Benjamin Petersonbac79492012-01-14 13:34:47 -05001364 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001365 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001366 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001367 return -1;
1368
Victor Stinnerd3f08822012-05-29 12:57:52 +02001369 if (from_start < 0) {
1370 PyErr_SetString(PyExc_IndexError, "string index out of range");
1371 return -1;
1372 }
1373 if (to_start < 0) {
1374 PyErr_SetString(PyExc_IndexError, "string index out of range");
1375 return -1;
1376 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001377 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1378 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1379 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001380 "Cannot write %zi characters at %zi "
1381 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001382 how_many, to_start, PyUnicode_GET_LENGTH(to));
1383 return -1;
1384 }
1385
1386 if (how_many == 0)
1387 return 0;
1388
Victor Stinner488fa492011-12-12 00:01:39 +01001389 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001390 return -1;
1391
1392 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1393 if (err) {
1394 PyErr_Format(PyExc_SystemError,
1395 "Cannot copy %s characters "
1396 "into a string of %s characters",
1397 unicode_kind_name(from),
1398 unicode_kind_name(to));
1399 return -1;
1400 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001401 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001402}
1403
Victor Stinner17222162011-09-28 22:15:37 +02001404/* Find the maximum code point and count the number of surrogate pairs so a
1405 correct string length can be computed before converting a string to UCS4.
1406 This function counts single surrogates as a character and not as a pair.
1407
1408 Return 0 on success, or -1 on error. */
1409static int
1410find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1411 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412{
1413 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001414 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415
Victor Stinnerc53be962011-10-02 21:33:54 +02001416 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 *num_surrogates = 0;
1418 *maxchar = 0;
1419
1420 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001422 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1423 && (iter+1) < end
1424 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1425 {
1426 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1427 ++(*num_surrogates);
1428 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429 }
1430 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001431#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001432 {
1433 ch = *iter;
1434 iter++;
1435 }
1436 if (ch > *maxchar) {
1437 *maxchar = ch;
1438 if (*maxchar > MAX_UNICODE) {
1439 PyErr_Format(PyExc_ValueError,
1440 "character U+%x is not in range [U+0000; U+10ffff]",
1441 ch);
1442 return -1;
1443 }
1444 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 }
1446 return 0;
1447}
1448
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001449int
1450_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451{
1452 wchar_t *end;
1453 Py_UCS4 maxchar = 0;
1454 Py_ssize_t num_surrogates;
1455#if SIZEOF_WCHAR_T == 2
1456 Py_ssize_t length_wo_surrogates;
1457#endif
1458
Georg Brandl7597add2011-10-05 16:36:47 +02001459 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001460 strings were created using _PyObject_New() and where no canonical
1461 representation (the str field) has been set yet aka strings
1462 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001463 assert(_PyUnicode_CHECK(unicode));
1464 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001466 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001467 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001468 /* Actually, it should neither be interned nor be anything else: */
1469 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001472 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001473 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475
1476 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001477 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1478 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001479 PyErr_NoMemory();
1480 return -1;
1481 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001482 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483 _PyUnicode_WSTR(unicode), end,
1484 PyUnicode_1BYTE_DATA(unicode));
1485 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1486 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1487 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1488 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001489 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001490 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001491 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 }
1493 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001494 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001495 _PyUnicode_UTF8(unicode) = NULL;
1496 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497 }
1498 PyObject_FREE(_PyUnicode_WSTR(unicode));
1499 _PyUnicode_WSTR(unicode) = NULL;
1500 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1501 }
1502 /* In this case we might have to convert down from 4-byte native
1503 wchar_t to 2-byte unicode. */
1504 else if (maxchar < 65536) {
1505 assert(num_surrogates == 0 &&
1506 "FindMaxCharAndNumSurrogatePairs() messed up");
1507
Victor Stinner506f5922011-09-28 22:34:18 +02001508#if SIZEOF_WCHAR_T == 2
1509 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001510 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001511 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1512 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1513 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001514 _PyUnicode_UTF8(unicode) = NULL;
1515 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001516#else
1517 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001518 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001519 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001520 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001521 PyErr_NoMemory();
1522 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001523 }
Victor Stinner506f5922011-09-28 22:34:18 +02001524 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1525 _PyUnicode_WSTR(unicode), end,
1526 PyUnicode_2BYTE_DATA(unicode));
1527 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1528 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1529 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001530 _PyUnicode_UTF8(unicode) = NULL;
1531 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001532 PyObject_FREE(_PyUnicode_WSTR(unicode));
1533 _PyUnicode_WSTR(unicode) = NULL;
1534 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1535#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001536 }
1537 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1538 else {
1539#if SIZEOF_WCHAR_T == 2
1540 /* in case the native representation is 2-bytes, we need to allocate a
1541 new normalized 4-byte version. */
1542 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001543 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1544 PyErr_NoMemory();
1545 return -1;
1546 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001547 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1548 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001549 PyErr_NoMemory();
1550 return -1;
1551 }
1552 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1553 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001554 _PyUnicode_UTF8(unicode) = NULL;
1555 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001556 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1557 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001558 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559 PyObject_FREE(_PyUnicode_WSTR(unicode));
1560 _PyUnicode_WSTR(unicode) = NULL;
1561 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1562#else
1563 assert(num_surrogates == 0);
1564
Victor Stinnerc3c74152011-10-02 20:39:55 +02001565 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001567 _PyUnicode_UTF8(unicode) = NULL;
1568 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1570#endif
1571 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1572 }
1573 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001574 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001575 return 0;
1576}
1577
Alexander Belopolsky40018472011-02-26 01:02:56 +00001578static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001579unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001580{
Walter Dörwald16807132007-05-25 13:52:07 +00001581 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 case SSTATE_NOT_INTERNED:
1583 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001584
Benjamin Peterson29060642009-01-31 22:14:21 +00001585 case SSTATE_INTERNED_MORTAL:
1586 /* revive dead object temporarily for DelItem */
1587 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001588 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001589 Py_FatalError(
1590 "deletion of interned string failed");
1591 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001592
Benjamin Peterson29060642009-01-31 22:14:21 +00001593 case SSTATE_INTERNED_IMMORTAL:
1594 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001595
Benjamin Peterson29060642009-01-31 22:14:21 +00001596 default:
1597 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001598 }
1599
Victor Stinner03490912011-10-03 23:45:12 +02001600 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001601 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001602 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001603 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001604 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1605 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001606
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001607 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608}
1609
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or
   one of the cached single-character entries of unicode_latin1[],
   0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string of exactly one character with a canonical
       representation can be a latin1 singleton. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1626
Alexander Belopolsky40018472011-02-26 01:02:56 +00001627static int
Victor Stinner488fa492011-12-12 00:01:39 +01001628unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001629{
Victor Stinner488fa492011-12-12 00:01:39 +01001630 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631 if (Py_REFCNT(unicode) != 1)
1632 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001633 if (_PyUnicode_HASH(unicode) != -1)
1634 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001635 if (PyUnicode_CHECK_INTERNED(unicode))
1636 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001637 if (!PyUnicode_CheckExact(unicode))
1638 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001639#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001640 /* singleton refcount is greater than 1 */
1641 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001642#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001643 return 1;
1644}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001645
Victor Stinnerfe226c02011-10-03 03:52:20 +02001646static int
1647unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1648{
1649 PyObject *unicode;
1650 Py_ssize_t old_length;
1651
1652 assert(p_unicode != NULL);
1653 unicode = *p_unicode;
1654
1655 assert(unicode != NULL);
1656 assert(PyUnicode_Check(unicode));
1657 assert(0 <= length);
1658
Victor Stinner910337b2011-10-03 03:20:16 +02001659 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001660 old_length = PyUnicode_WSTR_LENGTH(unicode);
1661 else
1662 old_length = PyUnicode_GET_LENGTH(unicode);
1663 if (old_length == length)
1664 return 0;
1665
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001666 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001667 _Py_INCREF_UNICODE_EMPTY();
1668 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001669 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001670 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001671 return 0;
1672 }
1673
Victor Stinner488fa492011-12-12 00:01:39 +01001674 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001675 PyObject *copy = resize_copy(unicode, length);
1676 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001677 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001678 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001679 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001680 }
1681
Victor Stinnerfe226c02011-10-03 03:52:20 +02001682 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001683 PyObject *new_unicode = resize_compact(unicode, length);
1684 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001685 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001686 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001687 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001688 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001689 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001690}
1691
Alexander Belopolsky40018472011-02-26 01:02:56 +00001692int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001693PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001694{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001695 PyObject *unicode;
1696 if (p_unicode == NULL) {
1697 PyErr_BadInternalCall();
1698 return -1;
1699 }
1700 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001701 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001702 {
1703 PyErr_BadInternalCall();
1704 return -1;
1705 }
1706 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001707}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001708
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001709/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001710
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001711 WARNING: The function doesn't copy the terminating null character and
1712 doesn't check the maximum character (may write a latin1 character in an
1713 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001714static void
1715unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1716 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001717{
1718 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1719 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001720 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001721
1722 switch (kind) {
1723 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001724 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001725#ifdef Py_DEBUG
1726 if (PyUnicode_IS_ASCII(unicode)) {
1727 Py_UCS4 maxchar = ucs1lib_find_max_char(
1728 (const Py_UCS1*)str,
1729 (const Py_UCS1*)str + len);
1730 assert(maxchar < 128);
1731 }
1732#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001733 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001734 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001735 }
1736 case PyUnicode_2BYTE_KIND: {
1737 Py_UCS2 *start = (Py_UCS2 *)data + index;
1738 Py_UCS2 *ucs2 = start;
1739 assert(index <= PyUnicode_GET_LENGTH(unicode));
1740
Victor Stinner184252a2012-06-16 02:57:41 +02001741 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001742 *ucs2 = (Py_UCS2)*str;
1743
1744 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001745 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001746 }
1747 default: {
1748 Py_UCS4 *start = (Py_UCS4 *)data + index;
1749 Py_UCS4 *ucs4 = start;
1750 assert(kind == PyUnicode_4BYTE_KIND);
1751 assert(index <= PyUnicode_GET_LENGTH(unicode));
1752
Victor Stinner184252a2012-06-16 02:57:41 +02001753 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001754 *ucs4 = (Py_UCS4)*str;
1755
1756 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001757 }
1758 }
1759}
1760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761static PyObject*
1762get_latin1_char(unsigned char ch)
1763{
Victor Stinnera464fc12011-10-02 20:39:30 +02001764 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001766 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767 if (!unicode)
1768 return NULL;
1769 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001770 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 unicode_latin1[ch] = unicode;
1772 }
1773 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001774 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775}
1776
Victor Stinner985a82a2014-01-03 12:53:47 +01001777static PyObject*
1778unicode_char(Py_UCS4 ch)
1779{
1780 PyObject *unicode;
1781
1782 assert(ch <= MAX_UNICODE);
1783
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001784 if (ch < 256)
1785 return get_latin1_char(ch);
1786
Victor Stinner985a82a2014-01-03 12:53:47 +01001787 unicode = PyUnicode_New(1, ch);
1788 if (unicode == NULL)
1789 return NULL;
1790 switch (PyUnicode_KIND(unicode)) {
1791 case PyUnicode_1BYTE_KIND:
1792 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1793 break;
1794 case PyUnicode_2BYTE_KIND:
1795 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1796 break;
1797 default:
1798 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1799 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1800 }
1801 assert(_PyUnicode_CheckConsistency(unicode, 1));
1802 return unicode;
1803}
1804
Alexander Belopolsky40018472011-02-26 01:02:56 +00001805PyObject *
1806PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001808 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 Py_UCS4 maxchar = 0;
1810 Py_ssize_t num_surrogates;
1811
1812 if (u == NULL)
1813 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001815 /* If the Unicode data is known at construction time, we can apply
1816 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001819 if (size == 0)
1820 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 /* Single character Unicode objects in the Latin-1 range are
1823 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001824 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 return get_latin1_char((unsigned char)*u);
1826
1827 /* If not empty and not single character, copy the Unicode data
1828 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001829 if (find_maxchar_surrogates(u, u + size,
1830 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 return NULL;
1832
Victor Stinner8faf8212011-12-08 22:14:11 +01001833 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001834 if (!unicode)
1835 return NULL;
1836
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001837 switch (PyUnicode_KIND(unicode)) {
1838 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001839 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1841 break;
1842 case PyUnicode_2BYTE_KIND:
1843#if Py_UNICODE_SIZE == 2
1844 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1845#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001846 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001847 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1848#endif
1849 break;
1850 case PyUnicode_4BYTE_KIND:
1851#if SIZEOF_WCHAR_T == 2
1852 /* This is the only case which has to process surrogates, thus
1853 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001854 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855#else
1856 assert(num_surrogates == 0);
1857 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1858#endif
1859 break;
1860 default:
1861 assert(0 && "Impossible state");
1862 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001864 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865}
1866
Alexander Belopolsky40018472011-02-26 01:02:56 +00001867PyObject *
1868PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001869{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001870 if (size < 0) {
1871 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001872 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001873 return NULL;
1874 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001875 if (u != NULL)
1876 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1877 else
1878 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001879}
1880
Alexander Belopolsky40018472011-02-26 01:02:56 +00001881PyObject *
1882PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001883{
1884 size_t size = strlen(u);
1885 if (size > PY_SSIZE_T_MAX) {
1886 PyErr_SetString(PyExc_OverflowError, "input too long");
1887 return NULL;
1888 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001889 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001890}
1891
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001892PyObject *
1893_PyUnicode_FromId(_Py_Identifier *id)
1894{
1895 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001896 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1897 strlen(id->string),
1898 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001899 if (!id->object)
1900 return NULL;
1901 PyUnicode_InternInPlace(&id->object);
1902 assert(!id->next);
1903 id->next = static_strings;
1904 static_strings = id;
1905 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001906 return id->object;
1907}
1908
1909void
1910_PyUnicode_ClearStaticStrings()
1911{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001912 _Py_Identifier *tmp, *s = static_strings;
1913 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001914 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001915 tmp = s->next;
1916 s->next = NULL;
1917 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001918 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001919 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001920}
1921
Benjamin Peterson0df54292012-03-26 14:50:32 -04001922/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001923
Victor Stinnerd3f08822012-05-29 12:57:52 +02001924PyObject*
1925_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001926{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001927 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001928 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001929 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001930#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001931 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001932#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001933 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001934 }
Victor Stinner785938e2011-12-11 20:09:03 +01001935 unicode = PyUnicode_New(size, 127);
1936 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001937 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001938 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1939 assert(_PyUnicode_CheckConsistency(unicode, 1));
1940 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001941}
1942
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001943static Py_UCS4
1944kind_maxchar_limit(unsigned int kind)
1945{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001946 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001947 case PyUnicode_1BYTE_KIND:
1948 return 0x80;
1949 case PyUnicode_2BYTE_KIND:
1950 return 0x100;
1951 case PyUnicode_4BYTE_KIND:
1952 return 0x10000;
1953 default:
1954 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001955 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001956 }
1957}
1958
Victor Stinnere6abb482012-05-02 01:15:40 +02001959Py_LOCAL_INLINE(Py_UCS4)
1960align_maxchar(Py_UCS4 maxchar)
1961{
1962 if (maxchar <= 127)
1963 return 127;
1964 else if (maxchar <= 255)
1965 return 255;
1966 else if (maxchar <= 65535)
1967 return 65535;
1968 else
1969 return MAX_UNICODE;
1970}
1971
Victor Stinner702c7342011-10-05 13:50:52 +02001972static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001973_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001974{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001975 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001976 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001977
Serhiy Storchaka678db842013-01-26 12:16:36 +02001978 if (size == 0)
1979 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001980 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001981 if (size == 1)
1982 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001983
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001984 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001985 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986 if (!res)
1987 return NULL;
1988 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001989 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001991}
1992
Victor Stinnere57b1c02011-09-28 22:20:48 +02001993static PyObject*
1994_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995{
1996 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001997 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001998
Serhiy Storchaka678db842013-01-26 12:16:36 +02001999 if (size == 0)
2000 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002001 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002002 if (size == 1)
2003 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002004
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002005 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002006 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007 if (!res)
2008 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002009 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002011 else {
2012 _PyUnicode_CONVERT_BYTES(
2013 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2014 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002015 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 return res;
2017}
2018
Victor Stinnere57b1c02011-09-28 22:20:48 +02002019static PyObject*
2020_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021{
2022 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002023 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002024
Serhiy Storchaka678db842013-01-26 12:16:36 +02002025 if (size == 0)
2026 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002027 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002028 if (size == 1)
2029 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002030
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002031 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002032 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 if (!res)
2034 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002035 if (max_char < 256)
2036 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2037 PyUnicode_1BYTE_DATA(res));
2038 else if (max_char < 0x10000)
2039 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2040 PyUnicode_2BYTE_DATA(res));
2041 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002043 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002044 return res;
2045}
2046
2047PyObject*
2048PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2049{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002050 if (size < 0) {
2051 PyErr_SetString(PyExc_ValueError, "size must be positive");
2052 return NULL;
2053 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002054 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002055 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002056 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002058 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002060 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002061 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002062 PyErr_SetString(PyExc_SystemError, "invalid kind");
2063 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002064 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002065}
2066
Victor Stinnerece58de2012-04-23 23:36:38 +02002067Py_UCS4
2068_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2069{
2070 enum PyUnicode_Kind kind;
2071 void *startptr, *endptr;
2072
2073 assert(PyUnicode_IS_READY(unicode));
2074 assert(0 <= start);
2075 assert(end <= PyUnicode_GET_LENGTH(unicode));
2076 assert(start <= end);
2077
2078 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2079 return PyUnicode_MAX_CHAR_VALUE(unicode);
2080
2081 if (start == end)
2082 return 127;
2083
Victor Stinner94d558b2012-04-27 22:26:58 +02002084 if (PyUnicode_IS_ASCII(unicode))
2085 return 127;
2086
Victor Stinnerece58de2012-04-23 23:36:38 +02002087 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002088 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002089 endptr = (char *)startptr + end * kind;
2090 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002091 switch(kind) {
2092 case PyUnicode_1BYTE_KIND:
2093 return ucs1lib_find_max_char(startptr, endptr);
2094 case PyUnicode_2BYTE_KIND:
2095 return ucs2lib_find_max_char(startptr, endptr);
2096 case PyUnicode_4BYTE_KIND:
2097 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002098 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002099 assert(0);
2100 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002101 }
2102}
2103
Victor Stinner25a4b292011-10-06 12:31:55 +02002104/* Ensure that a string uses the most efficient storage, if it is not the
2105 case: create a new string with of the right kind. Write NULL into *p_unicode
2106 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002107static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002108unicode_adjust_maxchar(PyObject **p_unicode)
2109{
2110 PyObject *unicode, *copy;
2111 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002112 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002113 unsigned int kind;
2114
2115 assert(p_unicode != NULL);
2116 unicode = *p_unicode;
2117 assert(PyUnicode_IS_READY(unicode));
2118 if (PyUnicode_IS_ASCII(unicode))
2119 return;
2120
2121 len = PyUnicode_GET_LENGTH(unicode);
2122 kind = PyUnicode_KIND(unicode);
2123 if (kind == PyUnicode_1BYTE_KIND) {
2124 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002125 max_char = ucs1lib_find_max_char(u, u + len);
2126 if (max_char >= 128)
2127 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002128 }
2129 else if (kind == PyUnicode_2BYTE_KIND) {
2130 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002131 max_char = ucs2lib_find_max_char(u, u + len);
2132 if (max_char >= 256)
2133 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002134 }
2135 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002136 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002137 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002138 max_char = ucs4lib_find_max_char(u, u + len);
2139 if (max_char >= 0x10000)
2140 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002141 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002142 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002143 if (copy != NULL)
2144 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002145 Py_DECREF(unicode);
2146 *p_unicode = copy;
2147}
2148
Victor Stinner034f6cf2011-09-30 02:26:44 +02002149PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002150_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002151{
Victor Stinner87af4f22011-11-21 23:03:47 +01002152 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002153 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002154
Victor Stinner034f6cf2011-09-30 02:26:44 +02002155 if (!PyUnicode_Check(unicode)) {
2156 PyErr_BadInternalCall();
2157 return NULL;
2158 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002159 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002160 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002161
Victor Stinner87af4f22011-11-21 23:03:47 +01002162 length = PyUnicode_GET_LENGTH(unicode);
2163 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002164 if (!copy)
2165 return NULL;
2166 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2167
Victor Stinner87af4f22011-11-21 23:03:47 +01002168 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2169 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002170 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002171 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002172}
2173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002174
Victor Stinnerbc603d12011-10-02 01:00:40 +02002175/* Widen Unicode objects to larger buffers. Don't write terminating null
2176 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177
2178void*
2179_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2180{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002181 Py_ssize_t len;
2182 void *result;
2183 unsigned int skind;
2184
Benjamin Petersonbac79492012-01-14 13:34:47 -05002185 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002186 return NULL;
2187
2188 len = PyUnicode_GET_LENGTH(s);
2189 skind = PyUnicode_KIND(s);
2190 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002191 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002192 return NULL;
2193 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002194 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002195 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002196 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002197 if (!result)
2198 return PyErr_NoMemory();
2199 assert(skind == PyUnicode_1BYTE_KIND);
2200 _PyUnicode_CONVERT_BYTES(
2201 Py_UCS1, Py_UCS2,
2202 PyUnicode_1BYTE_DATA(s),
2203 PyUnicode_1BYTE_DATA(s) + len,
2204 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002206 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002207 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002208 if (!result)
2209 return PyErr_NoMemory();
2210 if (skind == PyUnicode_2BYTE_KIND) {
2211 _PyUnicode_CONVERT_BYTES(
2212 Py_UCS2, Py_UCS4,
2213 PyUnicode_2BYTE_DATA(s),
2214 PyUnicode_2BYTE_DATA(s) + len,
2215 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002217 else {
2218 assert(skind == PyUnicode_1BYTE_KIND);
2219 _PyUnicode_CONVERT_BYTES(
2220 Py_UCS1, Py_UCS4,
2221 PyUnicode_1BYTE_DATA(s),
2222 PyUnicode_1BYTE_DATA(s) + len,
2223 result);
2224 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002226 default:
2227 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228 }
Victor Stinner01698042011-10-04 00:04:26 +02002229 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002230 return NULL;
2231}
2232
2233static Py_UCS4*
2234as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2235 int copy_null)
2236{
2237 int kind;
2238 void *data;
2239 Py_ssize_t len, targetlen;
2240 if (PyUnicode_READY(string) == -1)
2241 return NULL;
2242 kind = PyUnicode_KIND(string);
2243 data = PyUnicode_DATA(string);
2244 len = PyUnicode_GET_LENGTH(string);
2245 targetlen = len;
2246 if (copy_null)
2247 targetlen++;
2248 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002249 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002250 if (!target) {
2251 PyErr_NoMemory();
2252 return NULL;
2253 }
2254 }
2255 else {
2256 if (targetsize < targetlen) {
2257 PyErr_Format(PyExc_SystemError,
2258 "string is longer than the buffer");
2259 if (copy_null && 0 < targetsize)
2260 target[0] = 0;
2261 return NULL;
2262 }
2263 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002264 if (kind == PyUnicode_1BYTE_KIND) {
2265 Py_UCS1 *start = (Py_UCS1 *) data;
2266 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002268 else if (kind == PyUnicode_2BYTE_KIND) {
2269 Py_UCS2 *start = (Py_UCS2 *) data;
2270 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2271 }
2272 else {
2273 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002275 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 if (copy_null)
2277 target[len] = 0;
2278 return target;
2279}
2280
2281Py_UCS4*
2282PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2283 int copy_null)
2284{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002285 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 PyErr_BadInternalCall();
2287 return NULL;
2288 }
2289 return as_ucs4(string, target, targetsize, copy_null);
2290}
2291
2292Py_UCS4*
2293PyUnicode_AsUCS4Copy(PyObject *string)
2294{
2295 return as_ucs4(string, NULL, 0, 1);
2296}
2297
2298#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002299
Alexander Belopolsky40018472011-02-26 01:02:56 +00002300PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002301PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002304 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002305 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002306 PyErr_BadInternalCall();
2307 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002308 }
2309
Martin v. Löwis790465f2008-04-05 20:41:37 +00002310 if (size == -1) {
2311 size = wcslen(w);
2312 }
2313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002314 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002315}
2316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002317#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002318
/* Maximum number of characters required for the output of %lld or %p.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002323
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002324static int
2325unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2326 Py_ssize_t width, Py_ssize_t precision)
2327{
2328 Py_ssize_t length, fill, arglen;
2329 Py_UCS4 maxchar;
2330
2331 if (PyUnicode_READY(str) == -1)
2332 return -1;
2333
2334 length = PyUnicode_GET_LENGTH(str);
2335 if ((precision == -1 || precision >= length)
2336 && width <= length)
2337 return _PyUnicodeWriter_WriteStr(writer, str);
2338
2339 if (precision != -1)
2340 length = Py_MIN(precision, length);
2341
2342 arglen = Py_MAX(length, width);
2343 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2344 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2345 else
2346 maxchar = writer->maxchar;
2347
2348 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2349 return -1;
2350
2351 if (width > length) {
2352 fill = width - length;
2353 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2354 return -1;
2355 writer->pos += fill;
2356 }
2357
2358 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2359 str, 0, length);
2360 writer->pos += length;
2361 return 0;
2362}
2363
2364static int
2365unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2366 Py_ssize_t width, Py_ssize_t precision)
2367{
2368 /* UTF-8 */
2369 Py_ssize_t length;
2370 PyObject *unicode;
2371 int res;
2372
2373 length = strlen(str);
2374 if (precision != -1)
2375 length = Py_MIN(length, precision);
2376 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2377 if (unicode == NULL)
2378 return -1;
2379
2380 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2381 Py_DECREF(unicode);
2382 return res;
2383}
2384
Victor Stinner96865452011-03-01 23:44:09 +00002385static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002386unicode_fromformat_arg(_PyUnicodeWriter *writer,
2387 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002388{
Victor Stinnere215d962012-10-06 23:03:36 +02002389 const char *p;
2390 Py_ssize_t len;
2391 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002392 Py_ssize_t width;
2393 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002394 int longflag;
2395 int longlongflag;
2396 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002397 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002398
2399 p = f;
2400 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002401 zeropad = 0;
2402 if (*f == '0') {
2403 zeropad = 1;
2404 f++;
2405 }
Victor Stinner96865452011-03-01 23:44:09 +00002406
2407 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002408 width = -1;
2409 if (Py_ISDIGIT((unsigned)*f)) {
2410 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002411 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002412 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002413 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002414 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002415 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002416 return NULL;
2417 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002418 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002419 f++;
2420 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002421 }
2422 precision = -1;
2423 if (*f == '.') {
2424 f++;
2425 if (Py_ISDIGIT((unsigned)*f)) {
2426 precision = (*f - '0');
2427 f++;
2428 while (Py_ISDIGIT((unsigned)*f)) {
2429 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2430 PyErr_SetString(PyExc_ValueError,
2431 "precision too big");
2432 return NULL;
2433 }
2434 precision = (precision * 10) + (*f - '0');
2435 f++;
2436 }
2437 }
Victor Stinner96865452011-03-01 23:44:09 +00002438 if (*f == '%') {
2439 /* "%.3%s" => f points to "3" */
2440 f--;
2441 }
2442 }
2443 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002444 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002445 f--;
2446 }
Victor Stinner96865452011-03-01 23:44:09 +00002447
2448 /* Handle %ld, %lu, %lld and %llu. */
2449 longflag = 0;
2450 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002451 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002452 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002453 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002454 longflag = 1;
2455 ++f;
2456 }
2457#ifdef HAVE_LONG_LONG
2458 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002459 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002460 longlongflag = 1;
2461 f += 2;
2462 }
2463#endif
2464 }
2465 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002466 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002467 size_tflag = 1;
2468 ++f;
2469 }
Victor Stinnere215d962012-10-06 23:03:36 +02002470
2471 if (f[1] == '\0')
2472 writer->overallocate = 0;
2473
2474 switch (*f) {
2475 case 'c':
2476 {
2477 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002478 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002479 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002480 "character argument not in range(0x110000)");
2481 return NULL;
2482 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002483 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002484 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002485 break;
2486 }
2487
2488 case 'i':
2489 case 'd':
2490 case 'u':
2491 case 'x':
2492 {
2493 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002494 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002495 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002496
2497 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002498 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002499 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002500 va_arg(*vargs, unsigned long));
2501#ifdef HAVE_LONG_LONG
2502 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002503 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002504 va_arg(*vargs, unsigned PY_LONG_LONG));
2505#endif
2506 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002507 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002508 va_arg(*vargs, size_t));
2509 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002510 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002511 va_arg(*vargs, unsigned int));
2512 }
2513 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002514 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002515 }
2516 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002517 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002518 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002519 va_arg(*vargs, long));
2520#ifdef HAVE_LONG_LONG
2521 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002522 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002523 va_arg(*vargs, PY_LONG_LONG));
2524#endif
2525 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002526 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002527 va_arg(*vargs, Py_ssize_t));
2528 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002529 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002530 va_arg(*vargs, int));
2531 }
2532 assert(len >= 0);
2533
Victor Stinnere215d962012-10-06 23:03:36 +02002534 if (precision < len)
2535 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002536
2537 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002538 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2539 return NULL;
2540
Victor Stinnere215d962012-10-06 23:03:36 +02002541 if (width > precision) {
2542 Py_UCS4 fillchar;
2543 fill = width - precision;
2544 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002545 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2546 return NULL;
2547 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002548 }
Victor Stinner15a11362012-10-06 23:48:20 +02002549 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002550 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002551 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2552 return NULL;
2553 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002554 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002555
Victor Stinner4a587072013-11-19 12:54:53 +01002556 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2557 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002558 break;
2559 }
2560
2561 case 'p':
2562 {
2563 char number[MAX_LONG_LONG_CHARS];
2564
2565 len = sprintf(number, "%p", va_arg(*vargs, void*));
2566 assert(len >= 0);
2567
2568 /* %p is ill-defined: ensure leading 0x. */
2569 if (number[1] == 'X')
2570 number[1] = 'x';
2571 else if (number[1] != 'x') {
2572 memmove(number + 2, number,
2573 strlen(number) + 1);
2574 number[0] = '0';
2575 number[1] = 'x';
2576 len += 2;
2577 }
2578
Victor Stinner4a587072013-11-19 12:54:53 +01002579 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002580 return NULL;
2581 break;
2582 }
2583
2584 case 's':
2585 {
2586 /* UTF-8 */
2587 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002588 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002589 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002590 break;
2591 }
2592
2593 case 'U':
2594 {
2595 PyObject *obj = va_arg(*vargs, PyObject *);
2596 assert(obj && _PyUnicode_CHECK(obj));
2597
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002598 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002599 return NULL;
2600 break;
2601 }
2602
2603 case 'V':
2604 {
2605 PyObject *obj = va_arg(*vargs, PyObject *);
2606 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002607 if (obj) {
2608 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002609 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002610 return NULL;
2611 }
2612 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002613 assert(str != NULL);
2614 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002615 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002616 }
2617 break;
2618 }
2619
2620 case 'S':
2621 {
2622 PyObject *obj = va_arg(*vargs, PyObject *);
2623 PyObject *str;
2624 assert(obj);
2625 str = PyObject_Str(obj);
2626 if (!str)
2627 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002628 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002629 Py_DECREF(str);
2630 return NULL;
2631 }
2632 Py_DECREF(str);
2633 break;
2634 }
2635
2636 case 'R':
2637 {
2638 PyObject *obj = va_arg(*vargs, PyObject *);
2639 PyObject *repr;
2640 assert(obj);
2641 repr = PyObject_Repr(obj);
2642 if (!repr)
2643 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002644 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002645 Py_DECREF(repr);
2646 return NULL;
2647 }
2648 Py_DECREF(repr);
2649 break;
2650 }
2651
2652 case 'A':
2653 {
2654 PyObject *obj = va_arg(*vargs, PyObject *);
2655 PyObject *ascii;
2656 assert(obj);
2657 ascii = PyObject_ASCII(obj);
2658 if (!ascii)
2659 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002660 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002661 Py_DECREF(ascii);
2662 return NULL;
2663 }
2664 Py_DECREF(ascii);
2665 break;
2666 }
2667
2668 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002669 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002670 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002671 break;
2672
2673 default:
2674 /* if we stumble upon an unknown formatting code, copy the rest
2675 of the format string to the output string. (we cannot just
2676 skip the code, since there's no way to know what's in the
2677 argument list) */
2678 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002679 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002680 return NULL;
2681 f = p+len;
2682 return f;
2683 }
2684
2685 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002686 return f;
2687}
2688
Walter Dörwaldd2034312007-05-18 16:29:38 +00002689PyObject *
2690PyUnicode_FromFormatV(const char *format, va_list vargs)
2691{
Victor Stinnere215d962012-10-06 23:03:36 +02002692 va_list vargs2;
2693 const char *f;
2694 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002695
Victor Stinner8f674cc2013-04-17 23:02:17 +02002696 _PyUnicodeWriter_Init(&writer);
2697 writer.min_length = strlen(format) + 100;
2698 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002699
2700 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2701 Copy it to be able to pass a reference to a subfunction. */
2702 Py_VA_COPY(vargs2, vargs);
2703
2704 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002705 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002706 f = unicode_fromformat_arg(&writer, f, &vargs2);
2707 if (f == NULL)
2708 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002709 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002710 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002711 const char *p;
2712 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002713
Victor Stinnere215d962012-10-06 23:03:36 +02002714 p = f;
2715 do
2716 {
2717 if ((unsigned char)*p > 127) {
2718 PyErr_Format(PyExc_ValueError,
2719 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2720 "string, got a non-ASCII byte: 0x%02x",
2721 (unsigned char)*p);
2722 return NULL;
2723 }
2724 p++;
2725 }
2726 while (*p != '\0' && *p != '%');
2727 len = p - f;
2728
2729 if (*p == '\0')
2730 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002731
2732 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002733 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002734
2735 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002736 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002737 }
Victor Stinnere215d962012-10-06 23:03:36 +02002738 return _PyUnicodeWriter_Finish(&writer);
2739
2740 fail:
2741 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002742 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002743}
2744
Walter Dörwaldd2034312007-05-18 16:29:38 +00002745PyObject *
2746PyUnicode_FromFormat(const char *format, ...)
2747{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002748 PyObject* ret;
2749 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002750
2751#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002752 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002753#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002754 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002755#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002756 ret = PyUnicode_FromFormatV(format, vargs);
2757 va_end(vargs);
2758 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002759}
2760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002761#ifdef HAVE_WCHAR_H
2762
Victor Stinner5593d8a2010-10-02 11:11:27 +00002763/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2764 convert a Unicode object to a wide character string.
2765
Victor Stinnerd88d9832011-09-06 02:00:05 +02002766 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002767 character) required to convert the unicode object. Ignore size argument.
2768
Victor Stinnerd88d9832011-09-06 02:00:05 +02002769 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002770 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002771 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002772static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002773unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002774 wchar_t *w,
2775 Py_ssize_t size)
2776{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002777 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002778 const wchar_t *wstr;
2779
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002780 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002781 if (wstr == NULL)
2782 return -1;
2783
Victor Stinner5593d8a2010-10-02 11:11:27 +00002784 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002785 if (size > res)
2786 size = res + 1;
2787 else
2788 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002789 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002790 return res;
2791 }
2792 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002794}
2795
2796Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002797PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002798 wchar_t *w,
2799 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002800{
2801 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002802 PyErr_BadInternalCall();
2803 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002805 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806}
2807
Victor Stinner137c34c2010-09-29 10:25:54 +00002808wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002809PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002810 Py_ssize_t *size)
2811{
2812 wchar_t* buffer;
2813 Py_ssize_t buflen;
2814
2815 if (unicode == NULL) {
2816 PyErr_BadInternalCall();
2817 return NULL;
2818 }
2819
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002820 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002821 if (buflen == -1)
2822 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002823 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00002824 if (buffer == NULL) {
2825 PyErr_NoMemory();
2826 return NULL;
2827 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002828 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002829 if (buflen == -1) {
2830 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002831 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002832 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002833 if (size != NULL)
2834 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002835 return buffer;
2836}
2837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002838#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839
Alexander Belopolsky40018472011-02-26 01:02:56 +00002840PyObject *
2841PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002842{
Victor Stinner8faf8212011-12-08 22:14:11 +01002843 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002844 PyErr_SetString(PyExc_ValueError,
2845 "chr() arg not in range(0x110000)");
2846 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002847 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002848
Victor Stinner985a82a2014-01-03 12:53:47 +01002849 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002850}
2851
Alexander Belopolsky40018472011-02-26 01:02:56 +00002852PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002853PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002855 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002856 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002857 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002858 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002859 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002860 Py_INCREF(obj);
2861 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002862 }
2863 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002864 /* For a Unicode subtype that's not a Unicode object,
2865 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002866 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002867 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002868 PyErr_Format(PyExc_TypeError,
2869 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002870 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002871 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002872}
2873
Alexander Belopolsky40018472011-02-26 01:02:56 +00002874PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002875PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002876 const char *encoding,
2877 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002878{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002879 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002880 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002881
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002883 PyErr_BadInternalCall();
2884 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002886
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002887 /* Decoding bytes objects is the most common case and should be fast */
2888 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002889 if (PyBytes_GET_SIZE(obj) == 0)
2890 _Py_RETURN_UNICODE_EMPTY();
2891 v = PyUnicode_Decode(
2892 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2893 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002894 return v;
2895 }
2896
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002897 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002898 PyErr_SetString(PyExc_TypeError,
2899 "decoding str is not supported");
2900 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002901 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002902
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002903 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2904 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2905 PyErr_Format(PyExc_TypeError,
Serhiy Storchakab757c832014-12-05 22:25:22 +02002906 "coercing to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002907 Py_TYPE(obj)->tp_name);
2908 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002909 }
Tim Petersced69f82003-09-16 20:30:58 +00002910
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002911 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002912 PyBuffer_Release(&buffer);
2913 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002914 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002915
Serhiy Storchaka05997252013-01-26 12:14:02 +02002916 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002917 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002918 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919}
2920
/* Write a normalized copy of an encoding name into lower: upper case
   letters are folded to lower case and every '_' becomes '-', so that e.g.
   "UTF_8" is recognised.  A NULL encoding is normalized to "utf-8".

   lower_len is the capacity of lower in bytes, terminator included.
   Return 1 on success, 0 if the result does not fit (the name is longer
   than lower_len-1 characters). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst;
    char *limit;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* limit is the last slot of the buffer, reserved for the terminator */
    limit = &lower[lower_len - 1];
    dst = lower;
    for (src = encoding; *src; src++) {
        if (dst == limit)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
2960
Alexander Belopolsky40018472011-02-26 01:02:56 +00002961PyObject *
2962PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002963 Py_ssize_t size,
2964 const char *encoding,
2965 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002966{
2967 PyObject *buffer = NULL, *unicode;
2968 Py_buffer info;
2969 char lower[11]; /* Enough for any encoding shortcut */
2970
Fred Drakee4315f52000-05-09 19:53:39 +00002971 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002972 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002973 if ((strcmp(lower, "utf-8") == 0) ||
2974 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002975 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002976 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002977 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01002978 (strcmp(lower, "iso-8859-1") == 0) ||
2979 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002980 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002981#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002982 else if (strcmp(lower, "mbcs") == 0)
2983 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002984#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002985 else if (strcmp(lower, "ascii") == 0)
2986 return PyUnicode_DecodeASCII(s, size, errors);
2987 else if (strcmp(lower, "utf-16") == 0)
2988 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2989 else if (strcmp(lower, "utf-32") == 0)
2990 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992
2993 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002994 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002995 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002996 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002997 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 if (buffer == NULL)
2999 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003000 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001 if (unicode == NULL)
3002 goto onError;
3003 if (!PyUnicode_Check(unicode)) {
3004 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003005 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3006 "use codecs.decode() to decode to arbitrary types",
3007 encoding,
3008 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003009 Py_DECREF(unicode);
3010 goto onError;
3011 }
3012 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003013 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003014
Benjamin Peterson29060642009-01-31 22:14:21 +00003015 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016 Py_XDECREF(buffer);
3017 return NULL;
3018}
3019
Alexander Belopolsky40018472011-02-26 01:02:56 +00003020PyObject *
3021PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003022 const char *encoding,
3023 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003024{
3025 PyObject *v;
3026
3027 if (!PyUnicode_Check(unicode)) {
3028 PyErr_BadArgument();
3029 goto onError;
3030 }
3031
3032 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003033 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003034
3035 /* Decode via the codec registry */
3036 v = PyCodec_Decode(unicode, encoding, errors);
3037 if (v == NULL)
3038 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003039 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003040
Benjamin Peterson29060642009-01-31 22:14:21 +00003041 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003042 return NULL;
3043}
3044
Alexander Belopolsky40018472011-02-26 01:02:56 +00003045PyObject *
3046PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003047 const char *encoding,
3048 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003049{
3050 PyObject *v;
3051
3052 if (!PyUnicode_Check(unicode)) {
3053 PyErr_BadArgument();
3054 goto onError;
3055 }
3056
3057 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003058 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003059
3060 /* Decode via the codec registry */
3061 v = PyCodec_Decode(unicode, encoding, errors);
3062 if (v == NULL)
3063 goto onError;
3064 if (!PyUnicode_Check(v)) {
3065 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003066 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3067 "use codecs.decode() to decode to arbitrary types",
3068 encoding,
3069 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003070 Py_DECREF(v);
3071 goto onError;
3072 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003073 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003074
Benjamin Peterson29060642009-01-31 22:14:21 +00003075 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003076 return NULL;
3077}
3078
Alexander Belopolsky40018472011-02-26 01:02:56 +00003079PyObject *
3080PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003081 Py_ssize_t size,
3082 const char *encoding,
3083 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084{
3085 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003086
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 unicode = PyUnicode_FromUnicode(s, size);
3088 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003089 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3091 Py_DECREF(unicode);
3092 return v;
3093}
3094
Alexander Belopolsky40018472011-02-26 01:02:56 +00003095PyObject *
3096PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003097 const char *encoding,
3098 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003099{
3100 PyObject *v;
3101
3102 if (!PyUnicode_Check(unicode)) {
3103 PyErr_BadArgument();
3104 goto onError;
3105 }
3106
3107 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003108 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003109
3110 /* Encode via the codec registry */
3111 v = PyCodec_Encode(unicode, encoding, errors);
3112 if (v == NULL)
3113 goto onError;
3114 return v;
3115
Benjamin Peterson29060642009-01-31 22:14:21 +00003116 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003117 return NULL;
3118}
3119
/* Locate the first wide character of wstr that wcstombs() refuses to
   convert in the current locale.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  The return value is the
   index, in wchar_t units, of the first unit that fails to convert.  0 is
   also returned when every unit converts, so a result of 0 is ambiguous. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* chunk holds a single character followed by a terminator; two slots
       are needed for the character when it may be a surrogate pair */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];
#else
    wchar_t chunk[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;
    const wchar_t *unit_start;

#if SIZEOF_WCHAR_T == 2
    chunk[2] = 0;
#else
    chunk[1] = 0;
#endif

    for (cursor = wstr; *cursor != L'\0'; ) {
        unit_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = *cursor;
            chunk[1] = 0;
            cursor++;
        }
#else
        chunk[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3166
Victor Stinner1b579672011-12-17 05:47:23 +01003167static int
3168locale_error_handler(const char *errors, int *surrogateescape)
3169{
3170 if (errors == NULL) {
3171 *surrogateescape = 0;
3172 return 0;
3173 }
3174
3175 if (strcmp(errors, "strict") == 0) {
3176 *surrogateescape = 0;
3177 return 0;
3178 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003179 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003180 *surrogateescape = 1;
3181 return 0;
3182 }
3183 PyErr_Format(PyExc_ValueError,
3184 "only 'strict' and 'surrogateescape' error handlers "
3185 "are supported, not '%s'",
3186 errors);
3187 return -1;
3188}
3189
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003190PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003191PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003192{
3193 Py_ssize_t wlen, wlen2;
3194 wchar_t *wstr;
3195 PyObject *bytes = NULL;
3196 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003197 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003198 PyObject *exc;
3199 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003200 int surrogateescape;
3201
3202 if (locale_error_handler(errors, &surrogateescape) < 0)
3203 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003204
3205 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3206 if (wstr == NULL)
3207 return NULL;
3208
3209 wlen2 = wcslen(wstr);
3210 if (wlen2 != wlen) {
3211 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003212 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003213 return NULL;
3214 }
3215
3216 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003217 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003218 char *str;
3219
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003220 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003221 if (str == NULL) {
3222 if (error_pos == (size_t)-1) {
3223 PyErr_NoMemory();
3224 PyMem_Free(wstr);
3225 return NULL;
3226 }
3227 else {
3228 goto encode_error;
3229 }
3230 }
3231 PyMem_Free(wstr);
3232
3233 bytes = PyBytes_FromString(str);
3234 PyMem_Free(str);
3235 }
3236 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003237 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003238 size_t len, len2;
3239
3240 len = wcstombs(NULL, wstr, 0);
3241 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003242 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003243 goto encode_error;
3244 }
3245
3246 bytes = PyBytes_FromStringAndSize(NULL, len);
3247 if (bytes == NULL) {
3248 PyMem_Free(wstr);
3249 return NULL;
3250 }
3251
3252 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3253 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003254 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003255 goto encode_error;
3256 }
3257 PyMem_Free(wstr);
3258 }
3259 return bytes;
3260
3261encode_error:
3262 errmsg = strerror(errno);
3263 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003264
3265 if (error_pos == (size_t)-1)
3266 error_pos = wcstombs_errorpos(wstr);
3267
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003268 PyMem_Free(wstr);
3269 Py_XDECREF(bytes);
3270
Victor Stinner2f197072011-12-17 07:08:30 +01003271 if (errmsg != NULL) {
3272 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003273 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003274 if (wstr != NULL) {
3275 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003276 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003277 } else
3278 errmsg = NULL;
3279 }
3280 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003281 reason = PyUnicode_FromString(
3282 "wcstombs() encountered an unencodable "
3283 "wide character");
3284 if (reason == NULL)
3285 return NULL;
3286
3287 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3288 "locale", unicode,
3289 (Py_ssize_t)error_pos,
3290 (Py_ssize_t)(error_pos+1),
3291 reason);
3292 Py_DECREF(reason);
3293 if (exc != NULL) {
3294 PyCodec_StrictErrors(exc);
3295 Py_XDECREF(exc);
3296 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003297 return NULL;
3298}
3299
Victor Stinnerad158722010-10-27 00:25:46 +00003300PyObject *
3301PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003302{
Victor Stinner99b95382011-07-04 14:23:54 +02003303#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003304 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003305#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003306 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003307#else
Victor Stinner793b5312011-04-27 00:24:21 +02003308 PyInterpreterState *interp = PyThreadState_GET()->interp;
3309 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3310 cannot use it to encode and decode filenames before it is loaded. Load
3311 the Python codec requires to encode at least its own filename. Use the C
3312 version of the locale codec until the codec registry is initialized and
3313 the Python codec is loaded.
3314
3315 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3316 cannot only rely on it: check also interp->fscodec_initialized for
3317 subinterpreters. */
3318 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003319 return PyUnicode_AsEncodedString(unicode,
3320 Py_FileSystemDefaultEncoding,
3321 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003322 }
3323 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003324 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003325 }
Victor Stinnerad158722010-10-27 00:25:46 +00003326#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003327}
3328
Alexander Belopolsky40018472011-02-26 01:02:56 +00003329PyObject *
3330PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003331 const char *encoding,
3332 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003333{
3334 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003335 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003336
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 if (!PyUnicode_Check(unicode)) {
3338 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003339 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340 }
Fred Drakee4315f52000-05-09 19:53:39 +00003341
Fred Drakee4315f52000-05-09 19:53:39 +00003342 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003343 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003344 if ((strcmp(lower, "utf-8") == 0) ||
3345 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003346 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003347 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003348 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003349 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003350 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003351 }
Victor Stinner37296e82010-06-10 13:36:23 +00003352 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003353 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003354 (strcmp(lower, "iso-8859-1") == 0) ||
3355 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003356 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003357#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003358 else if (strcmp(lower, "mbcs") == 0)
3359 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003360#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003361 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003362 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003363 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364
3365 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003366 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003367 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003368 return NULL;
3369
3370 /* The normal path */
3371 if (PyBytes_Check(v))
3372 return v;
3373
3374 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003375 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003376 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003377 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003378
3379 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003380 "encoder %s returned bytearray instead of bytes; "
3381 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003382 encoding);
3383 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003384 Py_DECREF(v);
3385 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003386 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003387
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003388 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3389 Py_DECREF(v);
3390 return b;
3391 }
3392
3393 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003394 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3395 "use codecs.encode() to encode to arbitrary types",
3396 encoding,
3397 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003398 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003399 return NULL;
3400}
3401
Alexander Belopolsky40018472011-02-26 01:02:56 +00003402PyObject *
3403PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003404 const char *encoding,
3405 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003406{
3407 PyObject *v;
3408
3409 if (!PyUnicode_Check(unicode)) {
3410 PyErr_BadArgument();
3411 goto onError;
3412 }
3413
3414 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003415 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003416
3417 /* Encode via the codec registry */
3418 v = PyCodec_Encode(unicode, encoding, errors);
3419 if (v == NULL)
3420 goto onError;
3421 if (!PyUnicode_Check(v)) {
3422 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003423 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3424 "use codecs.encode() to encode to arbitrary types",
3425 encoding,
3426 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003427 Py_DECREF(v);
3428 goto onError;
3429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003430 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003431
Benjamin Peterson29060642009-01-31 22:14:21 +00003432 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003433 return NULL;
3434}
3435
/* Locate the first byte sequence in str (len bytes long) that cannot be
   decoded in the current locale.

   When mbrtowc() is available the string is decoded one character at a
   time and the byte offset of the first invalid or incomplete sequence is
   returned.  0 is returned when no such sequence is found, and always when
   mbrtowc() is not available, so a result of 0 is ambiguous. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    size_t consumed;
    mbstate_t state;
    wchar_t decoded;

    /* start from the initial conversion state */
    memset(&state, 0, sizeof state);
    while (remaining) {
        consumed = mbrtowc(&decoded, cursor, remaining, &state);
        if (consumed == 0)
            /* Reached end of string */
            break;
        if (consumed == (size_t)-1 || consumed == (size_t)-2)
            /* Conversion error or incomplete character */
            return cursor - str;
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3466
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003467PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003468PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003469 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003470{
3471 wchar_t smallbuf[256];
3472 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3473 wchar_t *wstr;
3474 size_t wlen, wlen2;
3475 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003476 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003477 size_t error_pos;
3478 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003479 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3480 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003481
3482 if (locale_error_handler(errors, &surrogateescape) < 0)
3483 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003484
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003485 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3486 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003487 return NULL;
3488 }
3489
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003490 if (surrogateescape) {
3491 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003492 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003493 if (wstr == NULL) {
3494 if (wlen == (size_t)-1)
3495 PyErr_NoMemory();
3496 else
3497 PyErr_SetFromErrno(PyExc_OSError);
3498 return NULL;
3499 }
3500
3501 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003502 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003503 }
3504 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003505 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003506#ifndef HAVE_BROKEN_MBSTOWCS
3507 wlen = mbstowcs(NULL, str, 0);
3508#else
3509 wlen = len;
3510#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003511 if (wlen == (size_t)-1)
3512 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003513 if (wlen+1 <= smallbuf_len) {
3514 wstr = smallbuf;
3515 }
3516 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003517 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003518 if (!wstr)
3519 return PyErr_NoMemory();
3520 }
3521
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003522 wlen2 = mbstowcs(wstr, str, wlen+1);
3523 if (wlen2 == (size_t)-1) {
3524 if (wstr != smallbuf)
3525 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003526 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003527 }
3528#ifdef HAVE_BROKEN_MBSTOWCS
3529 assert(wlen2 == wlen);
3530#endif
3531 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3532 if (wstr != smallbuf)
3533 PyMem_Free(wstr);
3534 }
3535 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003536
3537decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003538 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003539 errmsg = strerror(errno);
3540 assert(errmsg != NULL);
3541
3542 error_pos = mbstowcs_errorpos(str, len);
3543 if (errmsg != NULL) {
3544 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003545 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003546 if (wstr != NULL) {
3547 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003548 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003549 }
Victor Stinner2f197072011-12-17 07:08:30 +01003550 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003551 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003552 reason = PyUnicode_FromString(
3553 "mbstowcs() encountered an invalid multibyte sequence");
3554 if (reason == NULL)
3555 return NULL;
3556
3557 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3558 "locale", str, len,
3559 (Py_ssize_t)error_pos,
3560 (Py_ssize_t)(error_pos+1),
3561 reason);
3562 Py_DECREF(reason);
3563 if (exc != NULL) {
3564 PyCodec_StrictErrors(exc);
3565 Py_XDECREF(exc);
3566 }
3567 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003568}
3569
3570PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003571PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003572{
3573 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003574 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003575}
3576
3577
3578PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003579PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003580 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003581 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3582}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003583
Christian Heimes5894ba72007-11-04 11:43:14 +00003584PyObject*
3585PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3586{
Victor Stinner99b95382011-07-04 14:23:54 +02003587#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003588 return PyUnicode_DecodeMBCS(s, size, NULL);
3589#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003590 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003591#else
Victor Stinner793b5312011-04-27 00:24:21 +02003592 PyInterpreterState *interp = PyThreadState_GET()->interp;
3593 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3594 cannot use it to encode and decode filenames before it is loaded. Load
3595 the Python codec requires to encode at least its own filename. Use the C
3596 version of the locale codec until the codec registry is initialized and
3597 the Python codec is loaded.
3598
3599 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3600 cannot only rely on it: check also interp->fscodec_initialized for
3601 subinterpreters. */
3602 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003603 return PyUnicode_Decode(s, size,
3604 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003605 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003606 }
3607 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003608 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003609 }
Victor Stinnerad158722010-10-27 00:25:46 +00003610#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003611}
3612
Martin v. Löwis011e8422009-05-05 04:43:17 +00003613
3614int
3615PyUnicode_FSConverter(PyObject* arg, void* addr)
3616{
3617 PyObject *output = NULL;
3618 Py_ssize_t size;
3619 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003620 if (arg == NULL) {
3621 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003622 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003623 return 1;
3624 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003625 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003626 output = arg;
3627 Py_INCREF(output);
3628 }
3629 else {
3630 arg = PyUnicode_FromObject(arg);
3631 if (!arg)
3632 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003633 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003634 Py_DECREF(arg);
3635 if (!output)
3636 return 0;
3637 if (!PyBytes_Check(output)) {
3638 Py_DECREF(output);
3639 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3640 return 0;
3641 }
3642 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003643 size = PyBytes_GET_SIZE(output);
3644 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003645 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003646 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003647 Py_DECREF(output);
3648 return 0;
3649 }
3650 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003651 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003652}
3653
3654
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003655int
3656PyUnicode_FSDecoder(PyObject* arg, void* addr)
3657{
3658 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003659 if (arg == NULL) {
3660 Py_DECREF(*(PyObject**)addr);
3661 return 1;
3662 }
3663 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003664 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003665 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003666 output = arg;
3667 Py_INCREF(output);
3668 }
3669 else {
3670 arg = PyBytes_FromObject(arg);
3671 if (!arg)
3672 return 0;
3673 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3674 PyBytes_GET_SIZE(arg));
3675 Py_DECREF(arg);
3676 if (!output)
3677 return 0;
3678 if (!PyUnicode_Check(output)) {
3679 Py_DECREF(output);
3680 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3681 return 0;
3682 }
3683 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003684 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003685 Py_DECREF(output);
3686 return 0;
3687 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003688 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003689 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003690 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003691 Py_DECREF(output);
3692 return 0;
3693 }
3694 *(PyObject**)addr = output;
3695 return Py_CLEANUP_SUPPORTED;
3696}
3697
3698
Martin v. Löwis5b222132007-06-10 09:51:05 +00003699char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003700PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003701{
Christian Heimesf3863112007-11-22 07:46:41 +00003702 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003703
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003704 if (!PyUnicode_Check(unicode)) {
3705 PyErr_BadArgument();
3706 return NULL;
3707 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003708 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003709 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003710
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003711 if (PyUnicode_UTF8(unicode) == NULL) {
3712 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003713 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3714 if (bytes == NULL)
3715 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003716 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3717 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003718 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003719 Py_DECREF(bytes);
3720 return NULL;
3721 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003722 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3723 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3724 PyBytes_AS_STRING(bytes),
3725 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003726 Py_DECREF(bytes);
3727 }
3728
3729 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003730 *psize = PyUnicode_UTF8_LENGTH(unicode);
3731 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003732}
3733
3734char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003735PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003737 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3738}
3739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003740Py_UNICODE *
3741PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3742{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003743 const unsigned char *one_byte;
3744#if SIZEOF_WCHAR_T == 4
3745 const Py_UCS2 *two_bytes;
3746#else
3747 const Py_UCS4 *four_bytes;
3748 const Py_UCS4 *ucs4_end;
3749 Py_ssize_t num_surrogates;
3750#endif
3751 wchar_t *w;
3752 wchar_t *wchar_end;
3753
3754 if (!PyUnicode_Check(unicode)) {
3755 PyErr_BadArgument();
3756 return NULL;
3757 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003758 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003759 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003760 assert(_PyUnicode_KIND(unicode) != 0);
3761 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003762
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003763 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003764#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003765 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3766 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003767 num_surrogates = 0;
3768
3769 for (; four_bytes < ucs4_end; ++four_bytes) {
3770 if (*four_bytes > 0xFFFF)
3771 ++num_surrogates;
3772 }
3773
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003774 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3775 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3776 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003777 PyErr_NoMemory();
3778 return NULL;
3779 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003780 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003782 w = _PyUnicode_WSTR(unicode);
3783 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3784 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003785 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3786 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003787 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003788 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003789 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3790 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003791 }
3792 else
3793 *w = *four_bytes;
3794
3795 if (w > wchar_end) {
3796 assert(0 && "Miscalculated string end");
3797 }
3798 }
3799 *w = 0;
3800#else
3801 /* sizeof(wchar_t) == 4 */
3802 Py_FatalError("Impossible unicode object state, wstr and str "
3803 "should share memory already.");
3804 return NULL;
3805#endif
3806 }
3807 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003808 if ((size_t)_PyUnicode_LENGTH(unicode) >
3809 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3810 PyErr_NoMemory();
3811 return NULL;
3812 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003813 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3814 (_PyUnicode_LENGTH(unicode) + 1));
3815 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003816 PyErr_NoMemory();
3817 return NULL;
3818 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003819 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3820 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3821 w = _PyUnicode_WSTR(unicode);
3822 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003823
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003824 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3825 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003826 for (; w < wchar_end; ++one_byte, ++w)
3827 *w = *one_byte;
3828 /* null-terminate the wstr */
3829 *w = 0;
3830 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003831 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003832#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003833 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003834 for (; w < wchar_end; ++two_bytes, ++w)
3835 *w = *two_bytes;
3836 /* null-terminate the wstr */
3837 *w = 0;
3838#else
3839 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003840 PyObject_FREE(_PyUnicode_WSTR(unicode));
3841 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003842 Py_FatalError("Impossible unicode object state, wstr "
3843 "and str should share memory already.");
3844 return NULL;
3845#endif
3846 }
3847 else {
3848 assert(0 && "This should never happen.");
3849 }
3850 }
3851 }
3852 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003853 *size = PyUnicode_WSTR_LENGTH(unicode);
3854 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003855}
3856
Alexander Belopolsky40018472011-02-26 01:02:56 +00003857Py_UNICODE *
3858PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003859{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003860 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003861}
3862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003863
Alexander Belopolsky40018472011-02-26 01:02:56 +00003864Py_ssize_t
3865PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866{
3867 if (!PyUnicode_Check(unicode)) {
3868 PyErr_BadArgument();
3869 goto onError;
3870 }
3871 return PyUnicode_GET_SIZE(unicode);
3872
Benjamin Peterson29060642009-01-31 22:14:21 +00003873 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003874 return -1;
3875}
3876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877Py_ssize_t
3878PyUnicode_GetLength(PyObject *unicode)
3879{
Victor Stinner07621332012-06-16 04:53:46 +02003880 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 PyErr_BadArgument();
3882 return -1;
3883 }
Victor Stinner07621332012-06-16 04:53:46 +02003884 if (PyUnicode_READY(unicode) == -1)
3885 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003886 return PyUnicode_GET_LENGTH(unicode);
3887}
3888
3889Py_UCS4
3890PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3891{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003892 void *data;
3893 int kind;
3894
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003895 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3896 PyErr_BadArgument();
3897 return (Py_UCS4)-1;
3898 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003899 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003900 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003901 return (Py_UCS4)-1;
3902 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003903 data = PyUnicode_DATA(unicode);
3904 kind = PyUnicode_KIND(unicode);
3905 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003906}
3907
3908int
3909PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3910{
3911 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003912 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003913 return -1;
3914 }
Victor Stinner488fa492011-12-12 00:01:39 +01003915 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003916 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003917 PyErr_SetString(PyExc_IndexError, "string index out of range");
3918 return -1;
3919 }
Victor Stinner488fa492011-12-12 00:01:39 +01003920 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003921 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003922 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3923 PyErr_SetString(PyExc_ValueError, "character out of range");
3924 return -1;
3925 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3927 index, ch);
3928 return 0;
3929}
3930
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3936
Victor Stinner554f3f02010-06-16 23:33:54 +00003937/* create or adjust a UnicodeDecodeError */
3938static void
3939make_decode_exception(PyObject **exceptionObject,
3940 const char *encoding,
3941 const char *input, Py_ssize_t length,
3942 Py_ssize_t startpos, Py_ssize_t endpos,
3943 const char *reason)
3944{
3945 if (*exceptionObject == NULL) {
3946 *exceptionObject = PyUnicodeDecodeError_Create(
3947 encoding, input, length, startpos, endpos, reason);
3948 }
3949 else {
3950 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3951 goto onError;
3952 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3953 goto onError;
3954 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3955 goto onError;
3956 }
3957 return;
3958
3959onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02003960 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00003961}
3962
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   The replacement is written into the wstr buffer of *output at *outpos,
   resizing *output when needed.  *input, *inend, *endinpos and *inptr are
   updated from the exception object and the position returned by the
   handler, since the callback may have replaced the input object.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" prefix and leaves the message. */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        /* Fail here instead of falling through and reading a non-bytes
           object with the PyBytes macros below. */
        PyErr_SetString(PyExc_TypeError, "exception attribute object must be bytes");
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow at least geometrically when that cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4078
4079static int
4080unicode_decode_call_errorhandler_writer(
4081 const char *errors, PyObject **errorHandler,
4082 const char *encoding, const char *reason,
4083 const char **input, const char **inend, Py_ssize_t *startinpos,
4084 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4085 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4086{
4087 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4088
4089 PyObject *restuple = NULL;
4090 PyObject *repunicode = NULL;
4091 Py_ssize_t insize;
4092 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004093 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004094 PyObject *inputobj = NULL;
4095
4096 if (*errorHandler == NULL) {
4097 *errorHandler = PyCodec_LookupError(errors);
4098 if (*errorHandler == NULL)
4099 goto onError;
4100 }
4101
4102 make_decode_exception(exceptionObject,
4103 encoding,
4104 *input, *inend - *input,
4105 *startinpos, *endinpos,
4106 reason);
4107 if (*exceptionObject == NULL)
4108 goto onError;
4109
4110 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4111 if (restuple == NULL)
4112 goto onError;
4113 if (!PyTuple_Check(restuple)) {
4114 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4115 goto onError;
4116 }
4117 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004118 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004119
4120 /* Copy back the bytes variables, which might have been modified by the
4121 callback */
4122 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4123 if (!inputobj)
4124 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004125 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004126 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004127 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004128 *input = PyBytes_AS_STRING(inputobj);
4129 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004130 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004131 /* we can DECREF safely, as the exception has another reference,
4132 so the object won't go away. */
4133 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004134
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004136 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004137 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004138 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004139 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004140 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004141
Victor Stinner8f674cc2013-04-17 23:02:17 +02004142 if (PyUnicode_READY(repunicode) < 0)
4143 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004144 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004145 if (replen > 1) {
4146 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004147 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004148 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4149 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4150 goto onError;
4151 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004152 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004153 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004154
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004156 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004157
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004158 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004159 Py_XDECREF(restuple);
4160 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161
Benjamin Peterson29060642009-01-31 22:14:21 +00004162 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004164 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004165}
4166
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004167/* --- UTF-7 Codec -------------------------------------------------------- */
4168
Antoine Pitrou244651a2009-05-04 18:56:13 +00004169/* See RFC2152 for details. We encode conservatively and decode liberally. */
4170
4171/* Three simple macros defining base-64. */
4172
4173/* Is c a base-64 character? */
4174
4175#define IS_BASE64(c) \
4176 (((c) >= 'A' && (c) <= 'Z') || \
4177 ((c) >= 'a' && (c) <= 'z') || \
4178 ((c) >= '0' && (c) <= '9') || \
4179 (c) == '+' || (c) == '/')
4180
4181/* given that c is a base-64 character, what is its base-64 value? */
4182
4183#define FROM_BASE64(c) \
4184 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
4185 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
4186 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
4187 (c) == '+' ? 62 : 63)
4188
4189/* What is the base-64 character of the bottom 6 bits of n? */
4190
4191#define TO_BASE64(n) \
4192 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4193
4194/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
4195 * decoded as itself. We are permissive on decoding; the only ASCII
4196 * byte not decoding to itself is the + which begins a base64
4197 * string. */
4198
4199#define DECODE_DIRECT(c) \
4200 ((c) <= 127 && (c) != '+')
4201
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4235
/* ENCODE_DIRECT: true when character c should be encoded as itself.
 * Only non-NUL ASCII characters can qualify.  Set D characters always
 * do; Set O characters do when directO is true and whitespace does
 * when directWS is true.  RFC2152 makes it clear that the answers to
 * these questions vary between applications, so this code needs to
 * be flexible.
 *
 * directO and directWS are parenthesised so that any expression may
 * be passed for them (e.g. `a || b`) without it re-associating with
 * the `&&` inside the macro.  c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004247
Alexander Belopolsky40018472011-02-26 01:02:56 +00004248PyObject *
4249PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004250 Py_ssize_t size,
4251 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004252{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004253 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4254}
4255
Antoine Pitrou244651a2009-05-04 18:56:13 +00004256/* The decoder. The only state we preserve is our read position,
4257 * i.e. how many characters we have consumed. So if we end in the
4258 * middle of a shift sequence we have to back off the read position
4259 * and the output to the beginning of the sequence, otherwise we lose
4260 * all the shift state (seen bits, number of bits seen, high
4261 * surrogate). */
4262
Alexander Belopolsky40018472011-02-26 01:02:56 +00004263PyObject *
4264PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004265 Py_ssize_t size,
4266 const char *errors,
4267 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004268{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004269 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004270 Py_ssize_t startinpos;
4271 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004272 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004273 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004274 const char *errmsg = "";
4275 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004276 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004277 unsigned int base64bits = 0;
4278 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004279 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004280 PyObject *errorHandler = NULL;
4281 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004282
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004283 if (size == 0) {
4284 if (consumed)
4285 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004286 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004287 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004288
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004289 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004290 _PyUnicodeWriter_Init(&writer);
4291 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004292
4293 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004294 e = s + size;
4295
4296 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004297 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004298 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004299 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004300
Antoine Pitrou244651a2009-05-04 18:56:13 +00004301 if (inShift) { /* in a base-64 section */
4302 if (IS_BASE64(ch)) { /* consume a base-64 character */
4303 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4304 base64bits += 6;
4305 s++;
4306 if (base64bits >= 16) {
4307 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004308 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004309 base64bits -= 16;
4310 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004311 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004312 if (surrogate) {
4313 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004314 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4315 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004316 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004317 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004318 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004319 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004320 }
4321 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004322 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004323 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004324 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004325 }
4326 }
Victor Stinner551ac952011-11-29 22:58:13 +01004327 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004328 /* first surrogate */
4329 surrogate = outCh;
4330 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004331 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004332 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004333 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004334 }
4335 }
4336 }
4337 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004338 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004339 if (base64bits > 0) { /* left-over bits */
4340 if (base64bits >= 6) {
4341 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004342 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004343 errmsg = "partial character in shift sequence";
4344 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004345 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004346 else {
4347 /* Some bits remain; they should be zero */
4348 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004349 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004350 errmsg = "non-zero padding bits in shift sequence";
4351 goto utf7Error;
4352 }
4353 }
4354 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004355 if (surrogate && DECODE_DIRECT(ch)) {
4356 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4357 goto onError;
4358 }
4359 surrogate = 0;
4360 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004361 /* '-' is absorbed; other terminating
4362 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004363 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004364 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004365 }
4366 }
4367 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004368 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004369 s++; /* consume '+' */
4370 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004371 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004372 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004373 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 }
4375 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004376 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004377 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004378 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004379 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004380 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004381 }
4382 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004383 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004384 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004385 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004386 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004387 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004388 else {
4389 startinpos = s-starts;
4390 s++;
4391 errmsg = "unexpected special character";
4392 goto utf7Error;
4393 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004394 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004395utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004396 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004397 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004398 errors, &errorHandler,
4399 "utf7", errmsg,
4400 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004401 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004402 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004403 }
4404
Antoine Pitrou244651a2009-05-04 18:56:13 +00004405 /* end of string */
4406
4407 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4408 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004409 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004410 if (surrogate ||
4411 (base64bits >= 6) ||
4412 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004413 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004414 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 errors, &errorHandler,
4416 "utf7", "unterminated shift sequence",
4417 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004418 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004419 goto onError;
4420 if (s < e)
4421 goto restart;
4422 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004423 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004424
4425 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004426 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004427 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004428 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004429 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004430 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004431 writer.kind, writer.data, shiftOutStart);
4432 Py_XDECREF(errorHandler);
4433 Py_XDECREF(exc);
4434 _PyUnicodeWriter_Dealloc(&writer);
4435 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004436 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004437 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004438 }
4439 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004440 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004441 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004442 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004443
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 Py_XDECREF(errorHandler);
4445 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004446 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004447
Benjamin Peterson29060642009-01-31 22:14:21 +00004448 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449 Py_XDECREF(errorHandler);
4450 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004451 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004452 return NULL;
4453}
4454
4455
Alexander Belopolsky40018472011-02-26 01:02:56 +00004456PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004457_PyUnicode_EncodeUTF7(PyObject *str,
4458 int base64SetO,
4459 int base64WhiteSpace,
4460 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004461{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004462 int kind;
4463 void *data;
4464 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004465 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004466 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004467 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004468 unsigned int base64bits = 0;
4469 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004470 char * out;
4471 char * start;
4472
Benjamin Petersonbac79492012-01-14 13:34:47 -05004473 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004474 return NULL;
4475 kind = PyUnicode_KIND(str);
4476 data = PyUnicode_DATA(str);
4477 len = PyUnicode_GET_LENGTH(str);
4478
4479 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004481
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004482 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004483 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004484 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004485 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004486 if (v == NULL)
4487 return NULL;
4488
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004489 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004490 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004491 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492
Antoine Pitrou244651a2009-05-04 18:56:13 +00004493 if (inShift) {
4494 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4495 /* shifting out */
4496 if (base64bits) { /* output remaining bits */
4497 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4498 base64buffer = 0;
4499 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004500 }
4501 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004502 /* Characters not in the BASE64 set implicitly unshift the sequence
4503 so no '-' is required, except if the character is itself a '-' */
4504 if (IS_BASE64(ch) || ch == '-') {
4505 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004506 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004507 *out++ = (char) ch;
4508 }
4509 else {
4510 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004511 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004512 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004513 else { /* not in a shift sequence */
4514 if (ch == '+') {
4515 *out++ = '+';
4516 *out++ = '-';
4517 }
4518 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4519 *out++ = (char) ch;
4520 }
4521 else {
4522 *out++ = '+';
4523 inShift = 1;
4524 goto encode_char;
4525 }
4526 }
4527 continue;
4528encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004529 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004530 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004531
Antoine Pitrou244651a2009-05-04 18:56:13 +00004532 /* code first surrogate */
4533 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004534 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004535 while (base64bits >= 6) {
4536 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4537 base64bits -= 6;
4538 }
4539 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004540 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004541 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 base64bits += 16;
4543 base64buffer = (base64buffer << 16) | ch;
4544 while (base64bits >= 6) {
4545 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4546 base64bits -= 6;
4547 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004548 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004549 if (base64bits)
4550 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4551 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004552 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004553 if (_PyBytes_Resize(&v, out - start) < 0)
4554 return NULL;
4555 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004556}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004557PyObject *
4558PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4559 Py_ssize_t size,
4560 int base64SetO,
4561 int base64WhiteSpace,
4562 const char *errors)
4563{
4564 PyObject *result;
4565 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4566 if (tmp == NULL)
4567 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004568 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004569 base64WhiteSpace, errors);
4570 Py_DECREF(tmp);
4571 return result;
4572}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004573
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574#undef IS_BASE64
4575#undef FROM_BASE64
4576#undef TO_BASE64
4577#undef DECODE_DIRECT
4578#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579
Guido van Rossumd57fd912000-03-10 22:53:23 +00004580/* --- UTF-8 Codec -------------------------------------------------------- */
4581
Alexander Belopolsky40018472011-02-26 01:02:56 +00004582PyObject *
4583PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004584 Py_ssize_t size,
4585 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586{
Walter Dörwald69652032004-09-07 20:24:22 +00004587 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4588}
4589
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004590#include "stringlib/asciilib.h"
4591#include "stringlib/codecs.h"
4592#include "stringlib/undef.h"
4593
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004594#include "stringlib/ucs1lib.h"
4595#include "stringlib/codecs.h"
4596#include "stringlib/undef.h"
4597
4598#include "stringlib/ucs2lib.h"
4599#include "stringlib/codecs.h"
4600#include "stringlib/undef.h"
4601
4602#include "stringlib/ucs4lib.h"
4603#include "stringlib/codecs.h"
4604#include "stringlib/undef.h"
4605
Antoine Pitrouab868312009-01-10 15:40:25 +00004606/* Mask to quickly check whether a C 'long' contains a
4607 non-ASCII, UTF8-encoded char. */
4608#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004609# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004610#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004611# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004612#else
4613# error C 'long' size should be either 4 or 8!
4614#endif
4615
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004616static Py_ssize_t
4617ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004618{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004619 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004620 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004621
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004622 /*
4623 * Issue #17237: m68k is a bit different from most architectures in
4624 * that objects do not use "natural alignment" - for example, int and
4625 * long are only aligned at 2-byte boundaries. Therefore the assert()
4626 * won't work; also, tests have shown that skipping the "optimised
4627 * version" will even speed up m68k.
4628 */
4629#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004630#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004631 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4632 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004633 /* Fast path, see in STRINGLIB(utf8_decode) for
4634 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004635 /* Help allocation */
4636 const char *_p = p;
4637 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004638 while (_p < aligned_end) {
4639 unsigned long value = *(const unsigned long *) _p;
4640 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004641 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004642 *((unsigned long *)q) = value;
4643 _p += SIZEOF_LONG;
4644 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004645 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004646 p = _p;
4647 while (p < end) {
4648 if ((unsigned char)*p & 0x80)
4649 break;
4650 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004651 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004652 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004654#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004655#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004656 while (p < end) {
4657 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4658 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004659 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004660 /* Help allocation */
4661 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004662 while (_p < aligned_end) {
4663 unsigned long value = *(unsigned long *) _p;
4664 if (value & ASCII_CHAR_MASK)
4665 break;
4666 _p += SIZEOF_LONG;
4667 }
4668 p = _p;
4669 if (_p == end)
4670 break;
4671 }
4672 if ((unsigned char)*p & 0x80)
4673 break;
4674 ++p;
4675 }
4676 memcpy(dest, start, p - start);
4677 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004678}
Antoine Pitrouab868312009-01-10 15:40:25 +00004679
Victor Stinner785938e2011-12-11 20:09:03 +01004680PyObject *
4681PyUnicode_DecodeUTF8Stateful(const char *s,
4682 Py_ssize_t size,
4683 const char *errors,
4684 Py_ssize_t *consumed)
4685{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004686 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004687 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004688 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004689
4690 Py_ssize_t startinpos;
4691 Py_ssize_t endinpos;
4692 const char *errmsg = "";
4693 PyObject *errorHandler = NULL;
4694 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004695
4696 if (size == 0) {
4697 if (consumed)
4698 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004699 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004700 }
4701
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004702 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4703 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004704 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004705 *consumed = 1;
4706 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004707 }
4708
Victor Stinner8f674cc2013-04-17 23:02:17 +02004709 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004710 writer.min_length = size;
4711 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004712 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004713
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004714 writer.pos = ascii_decode(s, end, writer.data);
4715 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004716 while (s < end) {
4717 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004718 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004719 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004720 if (PyUnicode_IS_ASCII(writer.buffer))
4721 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004722 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004723 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004724 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004725 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004726 } else {
4727 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004728 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004729 }
4730
4731 switch (ch) {
4732 case 0:
4733 if (s == end || consumed)
4734 goto End;
4735 errmsg = "unexpected end of data";
4736 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004737 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004738 break;
4739 case 1:
4740 errmsg = "invalid start byte";
4741 startinpos = s - starts;
4742 endinpos = startinpos + 1;
4743 break;
4744 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004745 case 3:
4746 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004747 errmsg = "invalid continuation byte";
4748 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004749 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004750 break;
4751 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004752 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004753 goto onError;
4754 continue;
4755 }
4756
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004757 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004758 errors, &errorHandler,
4759 "utf-8", errmsg,
4760 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004761 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004762 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004763 }
4764
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004765End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004766 if (consumed)
4767 *consumed = s - starts;
4768
4769 Py_XDECREF(errorHandler);
4770 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004771 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004772
4773onError:
4774 Py_XDECREF(errorHandler);
4775 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004776 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004777 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004778}
4779
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004780#ifdef __APPLE__
4781
4782/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner0d92c4f2012-11-12 23:32:21 +01004783 used to decode the command line arguments on Mac OS X.
4784
4785 Return a pointer to a newly allocated wide character string (use
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004786 PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004787
4788wchar_t*
4789_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4790{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004791 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004792 wchar_t *unicode;
4793 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004794
4795 /* Note: size will always be longer than the resulting Unicode
4796 character count */
Victor Stinnerf50e1872015-03-20 11:32:24 +01004797 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004798 return NULL;
Victor Stinner6f8eeee2013-07-07 22:57:45 +02004799 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004800 if (!unicode)
4801 return NULL;
4802
4803 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004804 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004805 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004806 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004807 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004808#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004809 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004810#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004811 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004812#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004813 if (ch > 0xFF) {
4814#if SIZEOF_WCHAR_T == 4
4815 assert(0);
4816#else
4817 assert(Py_UNICODE_IS_SURROGATE(ch));
4818 /* compute and append the two surrogates: */
4819 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4820 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4821#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004822 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004823 else {
4824 if (!ch && s == e)
4825 break;
4826 /* surrogateescape */
4827 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4828 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004829 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004830 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004831 return unicode;
4832}
4833
4834#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004836/* Primary internal function which creates utf8 encoded bytes objects.
4837
4838 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004839 and allocate exactly as much space needed at the end. Else allocate the
4840 maximum possible needed (4 result bytes per Unicode character), and return
4841 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004842*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004843PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004844_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845{
Victor Stinner6099a032011-12-18 14:22:26 +01004846 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004847 void *data;
4848 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004850 if (!PyUnicode_Check(unicode)) {
4851 PyErr_BadArgument();
4852 return NULL;
4853 }
4854
4855 if (PyUnicode_READY(unicode) == -1)
4856 return NULL;
4857
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004858 if (PyUnicode_UTF8(unicode))
4859 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4860 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004861
4862 kind = PyUnicode_KIND(unicode);
4863 data = PyUnicode_DATA(unicode);
4864 size = PyUnicode_GET_LENGTH(unicode);
4865
Benjamin Petersonead6b532011-12-20 17:23:42 -06004866 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004867 default:
4868 assert(0);
4869 case PyUnicode_1BYTE_KIND:
4870 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4871 assert(!PyUnicode_IS_ASCII(unicode));
4872 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4873 case PyUnicode_2BYTE_KIND:
4874 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4875 case PyUnicode_4BYTE_KIND:
4876 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004877 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878}
4879
Alexander Belopolsky40018472011-02-26 01:02:56 +00004880PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004881PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4882 Py_ssize_t size,
4883 const char *errors)
4884{
4885 PyObject *v, *unicode;
4886
4887 unicode = PyUnicode_FromUnicode(s, size);
4888 if (unicode == NULL)
4889 return NULL;
4890 v = _PyUnicode_AsUTF8String(unicode, errors);
4891 Py_DECREF(unicode);
4892 return v;
4893}
4894
4895PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004896PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004898 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004899}
4900
Walter Dörwald41980ca2007-08-16 21:55:45 +00004901/* --- UTF-32 Codec ------------------------------------------------------- */
4902
4903PyObject *
4904PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004905 Py_ssize_t size,
4906 const char *errors,
4907 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004908{
4909 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4910}
4911
4912PyObject *
4913PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004914 Py_ssize_t size,
4915 const char *errors,
4916 int *byteorder,
4917 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004918{
4919 const char *starts = s;
4920 Py_ssize_t startinpos;
4921 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004922 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004923 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004924 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004925 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004926 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004927 PyObject *errorHandler = NULL;
4928 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004929
Walter Dörwald41980ca2007-08-16 21:55:45 +00004930 q = (unsigned char *)s;
4931 e = q + size;
4932
4933 if (byteorder)
4934 bo = *byteorder;
4935
4936 /* Check for BOM marks (U+FEFF) in the input and adjust current
4937 byte order setting accordingly. In native mode, the leading BOM
4938 mark is skipped, in all other modes, it is copied to the output
4939 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004940 if (bo == 0 && size >= 4) {
4941 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4942 if (bom == 0x0000FEFF) {
4943 bo = -1;
4944 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004945 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004946 else if (bom == 0xFFFE0000) {
4947 bo = 1;
4948 q += 4;
4949 }
4950 if (byteorder)
4951 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004952 }
4953
Victor Stinnere64322e2012-10-30 23:12:47 +01004954 if (q == e) {
4955 if (consumed)
4956 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004957 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004958 }
4959
Victor Stinnere64322e2012-10-30 23:12:47 +01004960#ifdef WORDS_BIGENDIAN
4961 le = bo < 0;
4962#else
4963 le = bo <= 0;
4964#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004965 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01004966
Victor Stinner8f674cc2013-04-17 23:02:17 +02004967 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004968 writer.min_length = (e - q + 3) / 4;
4969 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004970 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004971
Victor Stinnere64322e2012-10-30 23:12:47 +01004972 while (1) {
4973 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004974 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004975
Victor Stinnere64322e2012-10-30 23:12:47 +01004976 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004977 enum PyUnicode_Kind kind = writer.kind;
4978 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004979 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004980 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004981 if (le) {
4982 do {
4983 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4984 if (ch > maxch)
4985 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004986 if (kind != PyUnicode_1BYTE_KIND &&
4987 Py_UNICODE_IS_SURROGATE(ch))
4988 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004989 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004990 q += 4;
4991 } while (q <= last);
4992 }
4993 else {
4994 do {
4995 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4996 if (ch > maxch)
4997 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004998 if (kind != PyUnicode_1BYTE_KIND &&
4999 Py_UNICODE_IS_SURROGATE(ch))
5000 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005001 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005002 q += 4;
5003 } while (q <= last);
5004 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005005 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005006 }
5007
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005008 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005009 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005010 startinpos = ((const char *)q) - starts;
5011 endinpos = startinpos + 4;
5012 }
5013 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005014 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005015 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005016 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005017 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005018 startinpos = ((const char *)q) - starts;
5019 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005020 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005021 else {
5022 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005023 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005024 goto onError;
5025 q += 4;
5026 continue;
5027 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005028 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005029 startinpos = ((const char *)q) - starts;
5030 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005031 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005032
5033 /* The remaining input chars are ignored if the callback
5034 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005035 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005036 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005037 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005039 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005040 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005041 }
5042
Walter Dörwald41980ca2007-08-16 21:55:45 +00005043 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005044 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005045
Walter Dörwald41980ca2007-08-16 21:55:45 +00005046 Py_XDECREF(errorHandler);
5047 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005048 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005049
Benjamin Peterson29060642009-01-31 22:14:21 +00005050 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005051 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005052 Py_XDECREF(errorHandler);
5053 Py_XDECREF(exc);
5054 return NULL;
5055}
5056
5057PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005058_PyUnicode_EncodeUTF32(PyObject *str,
5059 const char *errors,
5060 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005061{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005062 enum PyUnicode_Kind kind;
5063 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005064 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005065 PyObject *v;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005066 PY_UINT32_T *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005067#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005068 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005069#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005070 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005071#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005072 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005073 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005074 PyObject *errorHandler = NULL;
5075 PyObject *exc = NULL;
5076 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005078 if (!PyUnicode_Check(str)) {
5079 PyErr_BadArgument();
5080 return NULL;
5081 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005082 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005083 return NULL;
5084 kind = PyUnicode_KIND(str);
5085 data = PyUnicode_DATA(str);
5086 len = PyUnicode_GET_LENGTH(str);
5087
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005088 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005089 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005090 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005091 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005092 if (v == NULL)
5093 return NULL;
5094
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005095 /* output buffer is 4-bytes aligned */
5096 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5097 out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005099 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005100 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005101 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005102
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005103 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005104 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005105 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005106 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005107 else
5108 encoding = "utf-32";
5109
5110 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005111 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5112 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005113 }
5114
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005115 pos = 0;
5116 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005117 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005118
5119 if (kind == PyUnicode_2BYTE_KIND) {
5120 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5121 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005122 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005123 else {
5124 assert(kind == PyUnicode_4BYTE_KIND);
5125 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5126 &out, native_ordering);
5127 }
5128 if (pos == len)
5129 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005130
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005131 rep = unicode_encode_call_errorhandler(
5132 errors, &errorHandler,
5133 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005134 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005135 if (!rep)
5136 goto error;
5137
5138 if (PyBytes_Check(rep)) {
5139 repsize = PyBytes_GET_SIZE(rep);
5140 if (repsize & 3) {
5141 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005142 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005143 "surrogates not allowed");
5144 goto error;
5145 }
5146 moreunits = repsize / 4;
5147 }
5148 else {
5149 assert(PyUnicode_Check(rep));
5150 if (PyUnicode_READY(rep) < 0)
5151 goto error;
5152 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5153 if (!PyUnicode_IS_ASCII(rep)) {
5154 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005155 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005156 "surrogates not allowed");
5157 goto error;
5158 }
5159 }
5160
5161 /* four bytes are reserved for each surrogate */
5162 if (moreunits > 1) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005163 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005164 Py_ssize_t morebytes = 4 * (moreunits - 1);
5165 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5166 /* integer overflow */
5167 PyErr_NoMemory();
5168 goto error;
5169 }
5170 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5171 goto error;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005172 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005173 }
5174
5175 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005176 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5177 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005178 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005179 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005180 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5181 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005182 }
5183
5184 Py_CLEAR(rep);
5185 }
5186
5187 /* Cut back to size actually needed. This is necessary for, for example,
5188 encoding of a string containing isolated surrogates and the 'ignore'
5189 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005190 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005191 if (nsize != PyBytes_GET_SIZE(v))
5192 _PyBytes_Resize(&v, nsize);
5193 Py_XDECREF(errorHandler);
5194 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005195 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005196 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005197 error:
5198 Py_XDECREF(rep);
5199 Py_XDECREF(errorHandler);
5200 Py_XDECREF(exc);
5201 Py_XDECREF(v);
5202 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005203}
5204
Alexander Belopolsky40018472011-02-26 01:02:56 +00005205PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005206PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5207 Py_ssize_t size,
5208 const char *errors,
5209 int byteorder)
5210{
5211 PyObject *result;
5212 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5213 if (tmp == NULL)
5214 return NULL;
5215 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5216 Py_DECREF(tmp);
5217 return result;
5218}
5219
5220PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005221PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005222{
Victor Stinnerb960b342011-11-20 19:12:52 +01005223 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005224}
5225
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226/* --- UTF-16 Codec ------------------------------------------------------- */
5227
Tim Peters772747b2001-08-09 22:21:55 +00005228PyObject *
5229PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005230 Py_ssize_t size,
5231 const char *errors,
5232 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233{
Walter Dörwald69652032004-09-07 20:24:22 +00005234 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5235}
5236
5237PyObject *
5238PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 Py_ssize_t size,
5240 const char *errors,
5241 int *byteorder,
5242 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005243{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005244 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005245 Py_ssize_t startinpos;
5246 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005247 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005248 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005249 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005250 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005251 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005252 PyObject *errorHandler = NULL;
5253 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005254 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255
Tim Peters772747b2001-08-09 22:21:55 +00005256 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005257 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258
5259 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005260 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005262 /* Check for BOM marks (U+FEFF) in the input and adjust current
5263 byte order setting accordingly. In native mode, the leading BOM
5264 mark is skipped, in all other modes, it is copied to the output
5265 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005266 if (bo == 0 && size >= 2) {
5267 const Py_UCS4 bom = (q[1] << 8) | q[0];
5268 if (bom == 0xFEFF) {
5269 q += 2;
5270 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005271 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005272 else if (bom == 0xFFFE) {
5273 q += 2;
5274 bo = 1;
5275 }
5276 if (byteorder)
5277 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005278 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279
Antoine Pitrou63065d72012-05-15 23:48:04 +02005280 if (q == e) {
5281 if (consumed)
5282 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005283 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005284 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005285
Christian Heimes743e0cd2012-10-17 23:52:17 +02005286#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005287 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005288 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005289#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005290 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005291 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005292#endif
Tim Peters772747b2001-08-09 22:21:55 +00005293
Antoine Pitrou63065d72012-05-15 23:48:04 +02005294 /* Note: size will always be longer than the resulting Unicode
5295 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005296 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005297 writer.min_length = (e - q + 1) / 2;
5298 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005299 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005300
Antoine Pitrou63065d72012-05-15 23:48:04 +02005301 while (1) {
5302 Py_UCS4 ch = 0;
5303 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005304 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005305 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005306 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005307 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005308 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005309 native_ordering);
5310 else
5311 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005312 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005313 native_ordering);
5314 } else if (kind == PyUnicode_2BYTE_KIND) {
5315 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005316 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005317 native_ordering);
5318 } else {
5319 assert(kind == PyUnicode_4BYTE_KIND);
5320 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005321 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005322 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005323 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005324 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005325
Antoine Pitrou63065d72012-05-15 23:48:04 +02005326 switch (ch)
5327 {
5328 case 0:
5329 /* remaining byte at the end? (size should be even) */
5330 if (q == e || consumed)
5331 goto End;
5332 errmsg = "truncated data";
5333 startinpos = ((const char *)q) - starts;
5334 endinpos = ((const char *)e) - starts;
5335 break;
5336 /* The remaining input chars are ignored if the callback
5337 chooses to skip the input */
5338 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005339 q -= 2;
5340 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005341 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005342 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005343 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005344 endinpos = ((const char *)e) - starts;
5345 break;
5346 case 2:
5347 errmsg = "illegal encoding";
5348 startinpos = ((const char *)q) - 2 - starts;
5349 endinpos = startinpos + 2;
5350 break;
5351 case 3:
5352 errmsg = "illegal UTF-16 surrogate";
5353 startinpos = ((const char *)q) - 4 - starts;
5354 endinpos = startinpos + 2;
5355 break;
5356 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005357 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005358 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 continue;
5360 }
5361
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005362 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005363 errors,
5364 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005365 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005366 &starts,
5367 (const char **)&e,
5368 &startinpos,
5369 &endinpos,
5370 &exc,
5371 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005372 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374 }
5375
Antoine Pitrou63065d72012-05-15 23:48:04 +02005376End:
Walter Dörwald69652032004-09-07 20:24:22 +00005377 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005379
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005380 Py_XDECREF(errorHandler);
5381 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005382 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383
Benjamin Peterson29060642009-01-31 22:14:21 +00005384 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005385 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005386 Py_XDECREF(errorHandler);
5387 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388 return NULL;
5389}
5390
Tim Peters772747b2001-08-09 22:21:55 +00005391PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005392_PyUnicode_EncodeUTF16(PyObject *str,
5393 const char *errors,
5394 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005396 enum PyUnicode_Kind kind;
5397 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005398 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005399 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005400 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005401 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005402#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005403 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005404#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005405 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005406#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005407 const char *encoding;
5408 Py_ssize_t nsize, pos;
5409 PyObject *errorHandler = NULL;
5410 PyObject *exc = NULL;
5411 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005412
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005413 if (!PyUnicode_Check(str)) {
5414 PyErr_BadArgument();
5415 return NULL;
5416 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005417 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005418 return NULL;
5419 kind = PyUnicode_KIND(str);
5420 data = PyUnicode_DATA(str);
5421 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005422
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005423 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005424 if (kind == PyUnicode_4BYTE_KIND) {
5425 const Py_UCS4 *in = (const Py_UCS4 *)data;
5426 const Py_UCS4 *end = in + len;
5427 while (in < end)
5428 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005429 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005430 }
5431 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005432 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005433 nsize = len + pairs + (byteorder == 0);
5434 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 if (v == NULL)
5436 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005438 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005439 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005440 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005442 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005443 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005444 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005445
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005446 if (kind == PyUnicode_1BYTE_KIND) {
5447 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5448 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005449 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005450
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005451 if (byteorder < 0)
5452 encoding = "utf-16-le";
5453 else if (byteorder > 0)
5454 encoding = "utf-16-be";
5455 else
5456 encoding = "utf-16";
5457
5458 pos = 0;
5459 while (pos < len) {
5460 Py_ssize_t repsize, moreunits;
5461
5462 if (kind == PyUnicode_2BYTE_KIND) {
5463 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5464 &out, native_ordering);
5465 }
5466 else {
5467 assert(kind == PyUnicode_4BYTE_KIND);
5468 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5469 &out, native_ordering);
5470 }
5471 if (pos == len)
5472 break;
5473
5474 rep = unicode_encode_call_errorhandler(
5475 errors, &errorHandler,
5476 encoding, "surrogates not allowed",
5477 str, &exc, pos, pos + 1, &pos);
5478 if (!rep)
5479 goto error;
5480
5481 if (PyBytes_Check(rep)) {
5482 repsize = PyBytes_GET_SIZE(rep);
5483 if (repsize & 1) {
5484 raise_encode_exception(&exc, encoding,
5485 str, pos - 1, pos,
5486 "surrogates not allowed");
5487 goto error;
5488 }
5489 moreunits = repsize / 2;
5490 }
5491 else {
5492 assert(PyUnicode_Check(rep));
5493 if (PyUnicode_READY(rep) < 0)
5494 goto error;
5495 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5496 if (!PyUnicode_IS_ASCII(rep)) {
5497 raise_encode_exception(&exc, encoding,
5498 str, pos - 1, pos,
5499 "surrogates not allowed");
5500 goto error;
5501 }
5502 }
5503
5504 /* two bytes are reserved for each surrogate */
5505 if (moreunits > 1) {
5506 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5507 Py_ssize_t morebytes = 2 * (moreunits - 1);
5508 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5509 /* integer overflow */
5510 PyErr_NoMemory();
5511 goto error;
5512 }
5513 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5514 goto error;
5515 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5516 }
5517
5518 if (PyBytes_Check(rep)) {
5519 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5520 out += moreunits;
5521 } else /* rep is unicode */ {
5522 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5523 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5524 &out, native_ordering);
5525 }
5526
5527 Py_CLEAR(rep);
5528 }
5529
5530 /* Cut back to size actually needed. This is necessary for, for example,
5531 encoding of a string containing isolated surrogates and the 'ignore' handler
5532 is used. */
5533 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5534 if (nsize != PyBytes_GET_SIZE(v))
5535 _PyBytes_Resize(&v, nsize);
5536 Py_XDECREF(errorHandler);
5537 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005538 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005539 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005540 error:
5541 Py_XDECREF(rep);
5542 Py_XDECREF(errorHandler);
5543 Py_XDECREF(exc);
5544 Py_XDECREF(v);
5545 return NULL;
5546#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547}
5548
Alexander Belopolsky40018472011-02-26 01:02:56 +00005549PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005550PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5551 Py_ssize_t size,
5552 const char *errors,
5553 int byteorder)
5554{
5555 PyObject *result;
5556 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5557 if (tmp == NULL)
5558 return NULL;
5559 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5560 Py_DECREF(tmp);
5561 return result;
5562}
5563
5564PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005565PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005567 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568}
5569
5570/* --- Unicode Escape Codec ----------------------------------------------- */
5571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005572/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5573 if all the escapes in the string make it still a valid ASCII string.
5574 Returns -1 if any escapes were found which cause the string to
5575 pop out of ASCII range. Otherwise returns the length of the
5576 required buffer to hold the string.
5577 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005578static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005579length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5580{
5581 const unsigned char *p = (const unsigned char *)s;
5582 const unsigned char *end = p + size;
5583 Py_ssize_t length = 0;
5584
5585 if (size < 0)
5586 return -1;
5587
5588 for (; p < end; ++p) {
5589 if (*p > 127) {
5590 /* Non-ASCII */
5591 return -1;
5592 }
5593 else if (*p != '\\') {
5594 /* Normal character */
5595 ++length;
5596 }
5597 else {
5598 /* Backslash-escape, check next char */
5599 ++p;
5600 /* Escape sequence reaches till end of string or
5601 non-ASCII follow-up. */
5602 if (p >= end || *p > 127)
5603 return -1;
5604 switch (*p) {
5605 case '\n':
5606 /* backslash + \n result in zero characters */
5607 break;
5608 case '\\': case '\'': case '\"':
5609 case 'b': case 'f': case 't':
5610 case 'n': case 'r': case 'v': case 'a':
5611 ++length;
5612 break;
5613 case '0': case '1': case '2': case '3':
5614 case '4': case '5': case '6': case '7':
5615 case 'x': case 'u': case 'U': case 'N':
5616 /* these do not guarantee ASCII characters */
5617 return -1;
5618 default:
5619 /* count the backslash + the other character */
5620 length += 2;
5621 }
5622 }
5623 }
5624 return length;
5625}
5626
Fredrik Lundh06d12682001-01-24 07:59:11 +00005627static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005628
Alexander Belopolsky40018472011-02-26 01:02:56 +00005629PyObject *
5630PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005631 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005632 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005634 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005635 Py_ssize_t startinpos;
5636 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005637 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005639 char* message;
5640 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005641 PyObject *errorHandler = NULL;
5642 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005643 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005644
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005645 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005646 if (len == 0)
5647 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005648
5649 /* After length_of_escaped_ascii_string() there are two alternatives,
5650 either the string is pure ASCII with named escapes like \n, etc.
5651 and we determined it's exact size (common case)
5652 or it contains \x, \u, ... escape sequences. then we create a
5653 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005654 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005655 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005656 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005657 }
5658 else {
5659 /* Escaped strings will always be longer than the resulting
5660 Unicode string, so we start with size here and then reduce the
5661 length after conversion to the true value.
5662 (but if the error callback returns a long replacement string
5663 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005664 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005665 }
5666
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005668 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005670
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671 while (s < end) {
5672 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005673 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005674 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675
5676 /* Non-escape characters are interpreted as Unicode ordinals */
5677 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005678 x = (unsigned char)*s;
5679 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005680 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005681 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 continue;
5683 }
5684
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005685 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686 /* \ - Escapes */
5687 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005688 c = *s++;
5689 if (s > end)
5690 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005691
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005692 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693
Benjamin Peterson29060642009-01-31 22:14:21 +00005694 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005695#define WRITECHAR(ch) \
5696 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005697 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005698 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005699 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005700
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005702 case '\\': WRITECHAR('\\'); break;
5703 case '\'': WRITECHAR('\''); break;
5704 case '\"': WRITECHAR('\"'); break;
5705 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005706 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005707 case 'f': WRITECHAR('\014'); break;
5708 case 't': WRITECHAR('\t'); break;
5709 case 'n': WRITECHAR('\n'); break;
5710 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005711 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005712 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005713 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005714 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717 case '0': case '1': case '2': case '3':
5718 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005719 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005720 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005721 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005722 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005723 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005725 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 break;
5727
Benjamin Peterson29060642009-01-31 22:14:21 +00005728 /* hex escapes */
5729 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005731 digits = 2;
5732 message = "truncated \\xXX escape";
5733 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734
Benjamin Peterson29060642009-01-31 22:14:21 +00005735 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005737 digits = 4;
5738 message = "truncated \\uXXXX escape";
5739 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005742 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005743 digits = 8;
5744 message = "truncated \\UXXXXXXXX escape";
5745 hexescape:
5746 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005747 if (end - s < digits) {
5748 /* count only hex digits */
5749 for (; s < end; ++s) {
5750 c = (unsigned char)*s;
5751 if (!Py_ISXDIGIT(c))
5752 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005753 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005754 goto error;
5755 }
5756 for (; digits--; ++s) {
5757 c = (unsigned char)*s;
5758 if (!Py_ISXDIGIT(c))
5759 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005760 chr = (chr<<4) & ~0xF;
5761 if (c >= '0' && c <= '9')
5762 chr += c - '0';
5763 else if (c >= 'a' && c <= 'f')
5764 chr += 10 + c - 'a';
5765 else
5766 chr += 10 + c - 'A';
5767 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005768 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769 /* _decoding_error will have already written into the
5770 target buffer. */
5771 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005772 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005773 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005774 message = "illegal Unicode character";
5775 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005776 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005777 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005778 break;
5779
Benjamin Peterson29060642009-01-31 22:14:21 +00005780 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005781 case 'N':
5782 message = "malformed \\N character escape";
5783 if (ucnhash_CAPI == NULL) {
5784 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005785 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5786 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005787 if (ucnhash_CAPI == NULL)
5788 goto ucnhashError;
5789 }
5790 if (*s == '{') {
5791 const char *start = s+1;
5792 /* look for the closing brace */
5793 while (*s != '}' && s < end)
5794 s++;
5795 if (s > start && s < end && *s == '}') {
5796 /* found a name. look it up in the unicode database */
5797 message = "unknown Unicode character name";
5798 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005799 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005800 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005801 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005802 goto store;
5803 }
5804 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005805 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005806
5807 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005808 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005809 message = "\\ at end of string";
5810 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005811 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005812 }
5813 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005814 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005815 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005816 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005817 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005819 continue;
5820
5821 error:
5822 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005823 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005824 errors, &errorHandler,
5825 "unicodeescape", message,
5826 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005827 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005828 goto onError;
5829 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005831#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005832
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005833 Py_XDECREF(errorHandler);
5834 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005835 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005836
Benjamin Peterson29060642009-01-31 22:14:21 +00005837 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005838 PyErr_SetString(
5839 PyExc_UnicodeError,
5840 "\\N escapes not supported (can't load unicodedata module)"
5841 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005842 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005843 Py_XDECREF(errorHandler);
5844 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005845 return NULL;
5846
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005848 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005849 Py_XDECREF(errorHandler);
5850 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 return NULL;
5852}
5853
5854/* Return a Unicode-Escape string version of the Unicode object.
5855
5856 If quotes is true, the string is enclosed in u"" or u'' quotes as
5857 appropriate.
5858
5859*/
5860
Alexander Belopolsky40018472011-02-26 01:02:56 +00005861PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005862PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005864 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005865 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005867 int kind;
5868 void *data;
5869 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870
Ezio Melottie7f90372012-10-05 03:33:31 +03005871 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005872 escape.
5873
Ezio Melottie7f90372012-10-05 03:33:31 +03005874 For UCS1 strings it's '\xxx', 4 bytes per source character.
5875 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5876 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005877 */
5878
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005879 if (!PyUnicode_Check(unicode)) {
5880 PyErr_BadArgument();
5881 return NULL;
5882 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005883 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005884 return NULL;
5885 len = PyUnicode_GET_LENGTH(unicode);
5886 kind = PyUnicode_KIND(unicode);
5887 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005888 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005889 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5890 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5891 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5892 }
5893
5894 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005895 return PyBytes_FromStringAndSize(NULL, 0);
5896
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005897 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005898 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005899
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005900 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005902 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904 if (repr == NULL)
5905 return NULL;
5906
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005907 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005909 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005910 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005911
Walter Dörwald79e913e2007-05-12 11:08:06 +00005912 /* Escape backslashes */
5913 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 *p++ = '\\';
5915 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005916 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005917 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005918
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005919 /* Map 21-bit characters to '\U00xxxxxx' */
5920 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005921 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005922 *p++ = '\\';
5923 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005924 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5925 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5926 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5927 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5928 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5929 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5930 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5931 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005932 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005933 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005934
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005936 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 *p++ = '\\';
5938 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005939 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5940 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5941 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5942 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005944
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005945 /* Map special whitespace to '\t', \n', '\r' */
5946 else if (ch == '\t') {
5947 *p++ = '\\';
5948 *p++ = 't';
5949 }
5950 else if (ch == '\n') {
5951 *p++ = '\\';
5952 *p++ = 'n';
5953 }
5954 else if (ch == '\r') {
5955 *p++ = '\\';
5956 *p++ = 'r';
5957 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005958
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005959 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005960 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005962 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005963 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5964 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005965 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005966
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 /* Copy everything else as-is */
5968 else
5969 *p++ = (char) ch;
5970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005971
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005972 assert(p - PyBytes_AS_STRING(repr) > 0);
5973 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5974 return NULL;
5975 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976}
5977
Alexander Belopolsky40018472011-02-26 01:02:56 +00005978PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005979PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5980 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005982 PyObject *result;
5983 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5984 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005986 result = PyUnicode_AsUnicodeEscapeString(tmp);
5987 Py_DECREF(tmp);
5988 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989}
5990
5991/* --- Raw Unicode Escape Codec ------------------------------------------- */
5992
Alexander Belopolsky40018472011-02-26 01:02:56 +00005993PyObject *
5994PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005995 Py_ssize_t size,
5996 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005998 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005999 Py_ssize_t startinpos;
6000 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006001 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 const char *end;
6003 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006004 PyObject *errorHandler = NULL;
6005 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006006
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006007 if (size == 0)
6008 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006009
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 /* Escaped strings will always be longer than the resulting
6011 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006012 length after conversion to the true value. (But decoding error
6013 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006014 _PyUnicodeWriter_Init(&writer);
6015 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006016
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 end = s + size;
6018 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 unsigned char c;
6020 Py_UCS4 x;
6021 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006022 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 /* Non-escape characters are interpreted as Unicode ordinals */
6025 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006026 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006027 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006028 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006029 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006030 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 startinpos = s-starts;
6032
6033 /* \u-escapes are only interpreted iff the number of leading
6034 backslashes if odd */
6035 bs = s;
6036 for (;s < end;) {
6037 if (*s != '\\')
6038 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006039 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006040 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006041 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 }
6043 if (((s - bs) & 1) == 0 ||
6044 s >= end ||
6045 (*s != 'u' && *s != 'U')) {
6046 continue;
6047 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006048 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 count = *s=='u' ? 4 : 8;
6050 s++;
6051
6052 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006053 for (x = 0, i = 0; i < count; ++i, ++s) {
6054 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006055 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006057 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 errors, &errorHandler,
6059 "rawunicodeescape", "truncated \\uXXXX",
6060 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006061 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006062 goto onError;
6063 goto nextByte;
6064 }
6065 x = (x<<4) & ~0xF;
6066 if (c >= '0' && c <= '9')
6067 x += c - '0';
6068 else if (c >= 'a' && c <= 'f')
6069 x += 10 + c - 'a';
6070 else
6071 x += 10 + c - 'A';
6072 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006073 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006074 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006075 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006076 }
6077 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006078 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006079 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006080 errors, &errorHandler,
6081 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006083 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006085 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006086 nextByte:
6087 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006089 Py_XDECREF(errorHandler);
6090 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006091 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006092
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006094 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006095 Py_XDECREF(errorHandler);
6096 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097 return NULL;
6098}
6099
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006100
Alexander Belopolsky40018472011-02-26 01:02:56 +00006101PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006102PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006104 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 char *p;
6106 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006107 Py_ssize_t expandsize, pos;
6108 int kind;
6109 void *data;
6110 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006112 if (!PyUnicode_Check(unicode)) {
6113 PyErr_BadArgument();
6114 return NULL;
6115 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006116 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006117 return NULL;
6118 kind = PyUnicode_KIND(unicode);
6119 data = PyUnicode_DATA(unicode);
6120 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006121 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6122 bytes, and 1 byte characters 4. */
6123 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006124
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006125 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006127
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006128 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 if (repr == NULL)
6130 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006131 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006132 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006134 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006135 for (pos = 0; pos < len; pos++) {
6136 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006137 /* Map 32-bit characters to '\Uxxxxxxxx' */
6138 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006139 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006140 *p++ = '\\';
6141 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006142 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6143 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6144 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6145 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6146 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6147 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6148 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6149 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006150 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006151 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 *p++ = '\\';
6154 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006155 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6156 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6157 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6158 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 /* Copy everything else as-is */
6161 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 *p++ = (char) ch;
6163 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006164
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006165 assert(p > q);
6166 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006167 return NULL;
6168 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169}
6170
Alexander Belopolsky40018472011-02-26 01:02:56 +00006171PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006172PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6173 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006175 PyObject *result;
6176 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6177 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006178 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006179 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6180 Py_DECREF(tmp);
6181 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182}
6183
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006184/* --- Unicode Internal Codec ------------------------------------------- */
6185
Alexander Belopolsky40018472011-02-26 01:02:56 +00006186PyObject *
6187_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006188 Py_ssize_t size,
6189 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006190{
6191 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006192 Py_ssize_t startinpos;
6193 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006194 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006195 const char *end;
6196 const char *reason;
6197 PyObject *errorHandler = NULL;
6198 PyObject *exc = NULL;
6199
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006200 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006201 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006202 1))
6203 return NULL;
6204
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006205 if (size == 0)
6206 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006207
Victor Stinner8f674cc2013-04-17 23:02:17 +02006208 _PyUnicodeWriter_Init(&writer);
6209 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6210 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006211 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006212 }
6213 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006214
Victor Stinner8f674cc2013-04-17 23:02:17 +02006215 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006216 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006217 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006218 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006219 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006220 endinpos = end-starts;
6221 reason = "truncated input";
6222 goto error;
6223 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006224 /* We copy the raw representation one byte at a time because the
6225 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006226 ((char *) &uch)[0] = s[0];
6227 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006228#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006229 ((char *) &uch)[2] = s[2];
6230 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006231#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006232 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006233#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006234 /* We have to sanity check the raw data, otherwise doom looms for
6235 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006236 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006237 endinpos = s - starts + Py_UNICODE_SIZE;
6238 reason = "illegal code point (> 0x10FFFF)";
6239 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006240 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006241#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006242 s += Py_UNICODE_SIZE;
6243#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006244 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006245 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006246 Py_UNICODE uch2;
6247 ((char *) &uch2)[0] = s[0];
6248 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006249 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006250 {
Victor Stinner551ac952011-11-29 22:58:13 +01006251 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006252 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006253 }
6254 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006255#endif
6256
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006257 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006258 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006259 continue;
6260
6261 error:
6262 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006263 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006264 errors, &errorHandler,
6265 "unicode_internal", reason,
6266 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006267 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006268 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006269 }
6270
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006271 Py_XDECREF(errorHandler);
6272 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006273 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006274
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006276 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006277 Py_XDECREF(errorHandler);
6278 Py_XDECREF(exc);
6279 return NULL;
6280}
6281
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282/* --- Latin-1 Codec ------------------------------------------------------ */
6283
Alexander Belopolsky40018472011-02-26 01:02:56 +00006284PyObject *
6285PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006286 Py_ssize_t size,
6287 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006290 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291}
6292
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006293/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006294static void
6295make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006296 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006297 PyObject *unicode,
6298 Py_ssize_t startpos, Py_ssize_t endpos,
6299 const char *reason)
6300{
6301 if (*exceptionObject == NULL) {
6302 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006303 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006304 encoding, unicode, startpos, endpos, reason);
6305 }
6306 else {
6307 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6308 goto onError;
6309 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6310 goto onError;
6311 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6312 goto onError;
6313 return;
6314 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006315 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006316 }
6317}
6318
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006319/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006320static void
6321raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006322 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006323 PyObject *unicode,
6324 Py_ssize_t startpos, Py_ssize_t endpos,
6325 const char *reason)
6326{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006327 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006328 encoding, unicode, startpos, endpos, reason);
6329 if (*exceptionObject != NULL)
6330 PyCodec_StrictErrors(*exceptionObject);
6331}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006332
6333/* error handling callback helper:
6334 build arguments, call the callback and check the arguments,
6335 put the result into newpos and return the replacement string, which
6336 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006337static PyObject *
6338unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006339 PyObject **errorHandler,
6340 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006341 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006342 Py_ssize_t startpos, Py_ssize_t endpos,
6343 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006344{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006345 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006346 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006347 PyObject *restuple;
6348 PyObject *resunicode;
6349
6350 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006351 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006352 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006354 }
6355
Benjamin Petersonbac79492012-01-14 13:34:47 -05006356 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006357 return NULL;
6358 len = PyUnicode_GET_LENGTH(unicode);
6359
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006360 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006361 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006362 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006364
6365 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006367 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006369 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006370 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006371 Py_DECREF(restuple);
6372 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006373 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006374 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 &resunicode, newpos)) {
6376 Py_DECREF(restuple);
6377 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006378 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006379 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6380 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6381 Py_DECREF(restuple);
6382 return NULL;
6383 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006384 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006385 *newpos = len + *newpos;
6386 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006387 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 Py_DECREF(restuple);
6389 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006390 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006391 Py_INCREF(resunicode);
6392 Py_DECREF(restuple);
6393 return resunicode;
6394}
6395
Alexander Belopolsky40018472011-02-26 01:02:56 +00006396static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006397unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006398 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006399 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006400{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006401 /* input state */
6402 Py_ssize_t pos=0, size;
6403 int kind;
6404 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006405 /* output object */
6406 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006407 /* pointer into the output */
6408 char *str;
6409 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006410 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006411 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6412 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006413 PyObject *errorHandler = NULL;
6414 PyObject *exc = NULL;
6415 /* the following variable is used for caching string comparisons
6416 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6417 int known_errorHandler = -1;
6418
Benjamin Petersonbac79492012-01-14 13:34:47 -05006419 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006420 return NULL;
6421 size = PyUnicode_GET_LENGTH(unicode);
6422 kind = PyUnicode_KIND(unicode);
6423 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006424 /* allocate enough for a simple encoding without
6425 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006426 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006427 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006428 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006429 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006430 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006431 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006432 ressize = size;
6433
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006434 while (pos < size) {
6435 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 /* can we encode this? */
6438 if (c<limit) {
6439 /* no overflow check, because we know that the space is enough */
6440 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006441 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006442 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006444 Py_ssize_t requiredsize;
6445 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006446 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006448 Py_ssize_t collstart = pos;
6449 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006450 /* find all unecodable characters */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006451 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 ++collend;
6453 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6454 if (known_errorHandler==-1) {
6455 if ((errors==NULL) || (!strcmp(errors, "strict")))
6456 known_errorHandler = 1;
6457 else if (!strcmp(errors, "replace"))
6458 known_errorHandler = 2;
6459 else if (!strcmp(errors, "ignore"))
6460 known_errorHandler = 3;
6461 else if (!strcmp(errors, "xmlcharrefreplace"))
6462 known_errorHandler = 4;
6463 else
6464 known_errorHandler = 0;
6465 }
6466 switch (known_errorHandler) {
6467 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006468 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 goto onError;
6470 case 2: /* replace */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006471 while (collstart++ < collend)
Benjamin Peterson29060642009-01-31 22:14:21 +00006472 *str++ = '?'; /* fall through */
6473 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006474 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 break;
6476 case 4: /* xmlcharrefreplace */
6477 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006478 requiredsize = respos;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006479 /* determine replacement size */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006480 for (i = collstart; i < collend; ++i) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006481 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006482 Py_ssize_t incr;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006483 if (ch < 10)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006484 incr = 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006485 else if (ch < 100)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006486 incr = 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006487 else if (ch < 1000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006488 incr = 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006489 else if (ch < 10000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006490 incr = 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006491 else if (ch < 100000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006492 incr = 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 else if (ch < 1000000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006494 incr = 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006495 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006496 assert(ch <= MAX_UNICODE);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006497 incr = 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006498 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006499 if (requiredsize > PY_SSIZE_T_MAX - incr)
6500 goto overflow;
6501 requiredsize += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006503 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6504 goto overflow;
6505 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006507 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 requiredsize = 2*ressize;
6509 if (_PyBytes_Resize(&res, requiredsize))
6510 goto onError;
6511 str = PyBytes_AS_STRING(res) + respos;
6512 ressize = requiredsize;
6513 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006514 /* generate replacement */
6515 for (i = collstart; i < collend; ++i) {
6516 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006518 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 break;
6520 default:
6521 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006522 encoding, reason, unicode, &exc,
6523 collstart, collend, &newpos);
6524 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006525 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006526 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006527 if (PyBytes_Check(repunicode)) {
6528 /* Directly copy bytes result to output. */
6529 repsize = PyBytes_Size(repunicode);
6530 if (repsize > 1) {
6531 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006532 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006533 if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6534 Py_DECREF(repunicode);
6535 goto overflow;
6536 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00006537 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6538 Py_DECREF(repunicode);
6539 goto onError;
6540 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006541 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006542 ressize += repsize-1;
6543 }
6544 memcpy(str, PyBytes_AsString(repunicode), repsize);
6545 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006546 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006547 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006548 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006549 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 /* need more space? (at least enough for what we
6551 have+the replacement+the rest of the string, so
6552 we won't have to check space for encodable characters) */
6553 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006554 repsize = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006555 requiredsize = respos;
6556 if (requiredsize > PY_SSIZE_T_MAX - repsize)
6557 goto overflow;
6558 requiredsize += repsize;
6559 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6560 goto overflow;
6561 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006563 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 requiredsize = 2*ressize;
6565 if (_PyBytes_Resize(&res, requiredsize)) {
6566 Py_DECREF(repunicode);
6567 goto onError;
6568 }
6569 str = PyBytes_AS_STRING(res) + respos;
6570 ressize = requiredsize;
6571 }
6572 /* check if there is anything unencodable in the replacement
6573 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006574 for (i = 0; repsize-->0; ++i, ++str) {
6575 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006577 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006578 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 Py_DECREF(repunicode);
6580 goto onError;
6581 }
6582 *str = (char)c;
6583 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006584 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006585 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006586 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006587 }
6588 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006589 /* Resize if we allocated to much */
6590 size = str - PyBytes_AS_STRING(res);
6591 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006592 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006593 if (_PyBytes_Resize(&res, size) < 0)
6594 goto onError;
6595 }
6596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006597 Py_XDECREF(errorHandler);
6598 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006599 return res;
6600
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006601 overflow:
6602 PyErr_SetString(PyExc_OverflowError,
6603 "encoded result is too long for a Python string");
6604
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006605 onError:
6606 Py_XDECREF(res);
6607 Py_XDECREF(errorHandler);
6608 Py_XDECREF(exc);
6609 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006610}
6611
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006612/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006613PyObject *
6614PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006615 Py_ssize_t size,
6616 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006618 PyObject *result;
6619 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6620 if (unicode == NULL)
6621 return NULL;
6622 result = unicode_encode_ucs1(unicode, errors, 256);
6623 Py_DECREF(unicode);
6624 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625}
6626
Alexander Belopolsky40018472011-02-26 01:02:56 +00006627PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006628_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629{
6630 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006631 PyErr_BadArgument();
6632 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006634 if (PyUnicode_READY(unicode) == -1)
6635 return NULL;
6636 /* Fast path: if it is a one-byte string, construct
6637 bytes object directly. */
6638 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6639 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6640 PyUnicode_GET_LENGTH(unicode));
6641 /* Non-Latin-1 characters present. Defer to above function to
6642 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006643 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006644}
6645
6646PyObject*
6647PyUnicode_AsLatin1String(PyObject *unicode)
6648{
6649 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006650}
6651
6652/* --- 7-bit ASCII Codec -------------------------------------------------- */
6653
Alexander Belopolsky40018472011-02-26 01:02:56 +00006654PyObject *
6655PyUnicode_DecodeASCII(const char *s,
6656 Py_ssize_t size,
6657 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006659 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006660 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006661 int kind;
6662 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006663 Py_ssize_t startinpos;
6664 Py_ssize_t endinpos;
6665 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006666 const char *e;
6667 PyObject *errorHandler = NULL;
6668 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006669
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006671 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006672
Guido van Rossumd57fd912000-03-10 22:53:23 +00006673 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006674 if (size == 1 && (unsigned char)s[0] < 128)
6675 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006676
Victor Stinner8f674cc2013-04-17 23:02:17 +02006677 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006678 writer.min_length = size;
6679 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006680 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006681
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006682 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006683 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006684 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006685 writer.pos = outpos;
6686 if (writer.pos == size)
6687 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006688
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006689 s += writer.pos;
6690 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006691 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006692 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006693 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006694 PyUnicode_WRITE(kind, data, writer.pos, c);
6695 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 ++s;
6697 }
6698 else {
6699 startinpos = s-starts;
6700 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006701 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006702 errors, &errorHandler,
6703 "ascii", "ordinal not in range(128)",
6704 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006705 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006706 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006707 kind = writer.kind;
6708 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006711 Py_XDECREF(errorHandler);
6712 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006713 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006714
Benjamin Peterson29060642009-01-31 22:14:21 +00006715 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006716 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717 Py_XDECREF(errorHandler);
6718 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719 return NULL;
6720}
6721
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006722/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006723PyObject *
6724PyUnicode_EncodeASCII(const Py_UNICODE *p,
6725 Py_ssize_t size,
6726 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006728 PyObject *result;
6729 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6730 if (unicode == NULL)
6731 return NULL;
6732 result = unicode_encode_ucs1(unicode, errors, 128);
6733 Py_DECREF(unicode);
6734 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735}
6736
Alexander Belopolsky40018472011-02-26 01:02:56 +00006737PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006738_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739{
6740 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 PyErr_BadArgument();
6742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006744 if (PyUnicode_READY(unicode) == -1)
6745 return NULL;
6746 /* Fast path: if it is an ASCII-only string, construct bytes object
6747 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006748 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006749 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6750 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006751 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006752}
6753
6754PyObject *
6755PyUnicode_AsASCIIString(PyObject *unicode)
6756{
6757 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758}
6759
Victor Stinner99b95382011-07-04 14:23:54 +02006760#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006761
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006762/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006763
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006764#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006765#define NEED_RETRY
6766#endif
6767
Victor Stinner3a50e702011-10-18 21:21:00 +02006768#ifndef WC_ERR_INVALID_CHARS
6769# define WC_ERR_INVALID_CHARS 0x0080
6770#endif
6771
6772static char*
6773code_page_name(UINT code_page, PyObject **obj)
6774{
6775 *obj = NULL;
6776 if (code_page == CP_ACP)
6777 return "mbcs";
6778 if (code_page == CP_UTF7)
6779 return "CP_UTF7";
6780 if (code_page == CP_UTF8)
6781 return "CP_UTF8";
6782
6783 *obj = PyBytes_FromFormat("cp%u", code_page);
6784 if (*obj == NULL)
6785 return NULL;
6786 return PyBytes_AS_STRING(*obj);
6787}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006788
Victor Stinner3a50e702011-10-18 21:21:00 +02006789static DWORD
6790decode_code_page_flags(UINT code_page)
6791{
6792 if (code_page == CP_UTF7) {
6793 /* The CP_UTF7 decoder only supports flags=0 */
6794 return 0;
6795 }
6796 else
6797 return MB_ERR_INVALID_CHARS;
6798}
6799
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006800/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006801 * Decode a byte string from a Windows code page into unicode object in strict
6802 * mode.
6803 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006804 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6805 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006806 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006807static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006808decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006809 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006810 const char *in,
6811 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006812{
Victor Stinner3a50e702011-10-18 21:21:00 +02006813 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006814 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006815 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006816
6817 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006818 assert(insize > 0);
6819 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6820 if (outsize <= 0)
6821 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006822
6823 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006825 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006826 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 if (*v == NULL)
6828 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006829 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006830 }
6831 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006832 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006833 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006834 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006835 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006836 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006837 }
6838
6839 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006840 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6841 if (outsize <= 0)
6842 goto error;
6843 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006844
Victor Stinner3a50e702011-10-18 21:21:00 +02006845error:
6846 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6847 return -2;
6848 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006849 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006850}
6851
Victor Stinner3a50e702011-10-18 21:21:00 +02006852/*
6853 * Decode a byte string from a code page into unicode object with an error
6854 * handler.
6855 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006856 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006857 * UnicodeDecodeError exception and returns -1 on error.
6858 */
6859static int
6860decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006861 PyObject **v,
6862 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01006863 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02006864{
6865 const char *startin = in;
6866 const char *endin = in + size;
6867 const DWORD flags = decode_code_page_flags(code_page);
6868 /* Ideally, we should get reason from FormatMessage. This is the Windows
6869 2000 English version of the message. */
6870 const char *reason = "No mapping for the Unicode character exists "
6871 "in the target code page.";
6872 /* each step cannot decode more than 1 character, but a character can be
6873 represented as a surrogate pair */
6874 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006875 int insize;
6876 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006877 PyObject *errorHandler = NULL;
6878 PyObject *exc = NULL;
6879 PyObject *encoding_obj = NULL;
6880 char *encoding;
6881 DWORD err;
6882 int ret = -1;
6883
6884 assert(size > 0);
6885
6886 encoding = code_page_name(code_page, &encoding_obj);
6887 if (encoding == NULL)
6888 return -1;
6889
Victor Stinner7d00cc12014-03-17 23:08:06 +01006890 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02006891 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6892 UnicodeDecodeError. */
6893 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6894 if (exc != NULL) {
6895 PyCodec_StrictErrors(exc);
6896 Py_CLEAR(exc);
6897 }
6898 goto error;
6899 }
6900
6901 if (*v == NULL) {
6902 /* Create unicode object */
6903 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6904 PyErr_NoMemory();
6905 goto error;
6906 }
Victor Stinnerab595942011-12-17 04:59:06 +01006907 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006908 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006909 if (*v == NULL)
6910 goto error;
6911 startout = PyUnicode_AS_UNICODE(*v);
6912 }
6913 else {
6914 /* Extend unicode object */
6915 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6916 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6917 PyErr_NoMemory();
6918 goto error;
6919 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006920 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006921 goto error;
6922 startout = PyUnicode_AS_UNICODE(*v) + n;
6923 }
6924
6925 /* Decode the byte string character per character */
6926 out = startout;
6927 while (in < endin)
6928 {
6929 /* Decode a character */
6930 insize = 1;
6931 do
6932 {
6933 outsize = MultiByteToWideChar(code_page, flags,
6934 in, insize,
6935 buffer, Py_ARRAY_LENGTH(buffer));
6936 if (outsize > 0)
6937 break;
6938 err = GetLastError();
6939 if (err != ERROR_NO_UNICODE_TRANSLATION
6940 && err != ERROR_INSUFFICIENT_BUFFER)
6941 {
6942 PyErr_SetFromWindowsErr(0);
6943 goto error;
6944 }
6945 insize++;
6946 }
6947 /* 4=maximum length of a UTF-8 sequence */
6948 while (insize <= 4 && (in + insize) <= endin);
6949
6950 if (outsize <= 0) {
6951 Py_ssize_t startinpos, endinpos, outpos;
6952
Victor Stinner7d00cc12014-03-17 23:08:06 +01006953 /* last character in partial decode? */
6954 if (in + insize >= endin && !final)
6955 break;
6956
Victor Stinner3a50e702011-10-18 21:21:00 +02006957 startinpos = in - startin;
6958 endinpos = startinpos + 1;
6959 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006960 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006961 errors, &errorHandler,
6962 encoding, reason,
6963 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006964 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006965 {
6966 goto error;
6967 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006968 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006969 }
6970 else {
6971 in += insize;
6972 memcpy(out, buffer, outsize * sizeof(wchar_t));
6973 out += outsize;
6974 }
6975 }
6976
6977 /* write a NUL character at the end */
6978 *out = 0;
6979
6980 /* Extend unicode object */
6981 outsize = out - startout;
6982 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006983 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006984 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02006985 /* (in - startin) <= size and size is an int */
6986 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02006987
6988error:
6989 Py_XDECREF(encoding_obj);
6990 Py_XDECREF(errorHandler);
6991 Py_XDECREF(exc);
6992 return ret;
6993}
6994
Victor Stinner3a50e702011-10-18 21:21:00 +02006995static PyObject *
6996decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006997 const char *s, Py_ssize_t size,
6998 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006999{
Victor Stinner76a31a62011-11-04 00:05:13 +01007000 PyObject *v = NULL;
7001 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007002
Victor Stinner3a50e702011-10-18 21:21:00 +02007003 if (code_page < 0) {
7004 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7005 return NULL;
7006 }
7007
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007008 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007009 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007010
Victor Stinner76a31a62011-11-04 00:05:13 +01007011 do
7012 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007013#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007014 if (size > INT_MAX) {
7015 chunk_size = INT_MAX;
7016 final = 0;
7017 done = 0;
7018 }
7019 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007020#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007021 {
7022 chunk_size = (int)size;
7023 final = (consumed == NULL);
7024 done = 1;
7025 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007026
Victor Stinner76a31a62011-11-04 00:05:13 +01007027 if (chunk_size == 0 && done) {
7028 if (v != NULL)
7029 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007030 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007031 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007032
Victor Stinner76a31a62011-11-04 00:05:13 +01007033 converted = decode_code_page_strict(code_page, &v,
7034 s, chunk_size);
7035 if (converted == -2)
7036 converted = decode_code_page_errors(code_page, &v,
7037 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007038 errors, final);
7039 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007040
7041 if (converted < 0) {
7042 Py_XDECREF(v);
7043 return NULL;
7044 }
7045
7046 if (consumed)
7047 *consumed += converted;
7048
7049 s += converted;
7050 size -= converted;
7051 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007052
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007053 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007054}
7055
Alexander Belopolsky40018472011-02-26 01:02:56 +00007056PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007057PyUnicode_DecodeCodePageStateful(int code_page,
7058 const char *s,
7059 Py_ssize_t size,
7060 const char *errors,
7061 Py_ssize_t *consumed)
7062{
7063 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7064}
7065
7066PyObject *
7067PyUnicode_DecodeMBCSStateful(const char *s,
7068 Py_ssize_t size,
7069 const char *errors,
7070 Py_ssize_t *consumed)
7071{
7072 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7073}
7074
7075PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007076PyUnicode_DecodeMBCS(const char *s,
7077 Py_ssize_t size,
7078 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007079{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007080 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7081}
7082
Victor Stinner3a50e702011-10-18 21:21:00 +02007083static DWORD
7084encode_code_page_flags(UINT code_page, const char *errors)
7085{
7086 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007087 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007088 }
7089 else if (code_page == CP_UTF7) {
7090 /* CP_UTF7 only supports flags=0 */
7091 return 0;
7092 }
7093 else {
7094 if (errors != NULL && strcmp(errors, "replace") == 0)
7095 return 0;
7096 else
7097 return WC_NO_BEST_FIT_CHARS;
7098 }
7099}
7100
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007101/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007102 * Encode a Unicode string to a Windows code page into a byte string in strict
7103 * mode.
7104 *
7105 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007106 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007108static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007109encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007110 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007111 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007112{
Victor Stinner554f3f02010-06-16 23:33:54 +00007113 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007114 BOOL *pusedDefaultChar = &usedDefaultChar;
7115 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007116 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007117 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007118 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007119 const DWORD flags = encode_code_page_flags(code_page, NULL);
7120 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007121 /* Create a substring so that we can get the UTF-16 representation
7122 of just the slice under consideration. */
7123 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007124
Martin v. Löwis3d325192011-11-04 18:23:06 +01007125 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007126
Victor Stinner3a50e702011-10-18 21:21:00 +02007127 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007128 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007129 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007130 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007131
Victor Stinner2fc507f2011-11-04 20:06:39 +01007132 substring = PyUnicode_Substring(unicode, offset, offset+len);
7133 if (substring == NULL)
7134 return -1;
7135 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7136 if (p == NULL) {
7137 Py_DECREF(substring);
7138 return -1;
7139 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007140 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007141
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007142 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007143 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007144 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007145 NULL, 0,
7146 NULL, pusedDefaultChar);
7147 if (outsize <= 0)
7148 goto error;
7149 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007150 if (pusedDefaultChar && *pusedDefaultChar) {
7151 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007152 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007153 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007154
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007156 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007157 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007158 if (*outbytes == NULL) {
7159 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007160 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007161 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007162 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007163 }
7164 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007165 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 const Py_ssize_t n = PyBytes_Size(*outbytes);
7167 if (outsize > PY_SSIZE_T_MAX - n) {
7168 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007169 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007170 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007171 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007172 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7173 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007175 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007176 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007177 }
7178
7179 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007180 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007181 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 out, outsize,
7183 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007184 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007185 if (outsize <= 0)
7186 goto error;
7187 if (pusedDefaultChar && *pusedDefaultChar)
7188 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007189 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007190
Victor Stinner3a50e702011-10-18 21:21:00 +02007191error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007192 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007193 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7194 return -2;
7195 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007196 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007197}
7198
Victor Stinner3a50e702011-10-18 21:21:00 +02007199/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007200 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007201 * error handler.
7202 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007203 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 * -1 on other error.
7205 */
7206static int
7207encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007208 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007209 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007210{
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007212 Py_ssize_t pos = unicode_offset;
7213 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007214 /* Ideally, we should get reason from FormatMessage. This is the Windows
7215 2000 English version of the message. */
7216 const char *reason = "invalid character";
7217 /* 4=maximum length of a UTF-8 sequence */
7218 char buffer[4];
7219 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7220 Py_ssize_t outsize;
7221 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 PyObject *errorHandler = NULL;
7223 PyObject *exc = NULL;
7224 PyObject *encoding_obj = NULL;
7225 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007226 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007227 PyObject *rep;
7228 int ret = -1;
7229
7230 assert(insize > 0);
7231
7232 encoding = code_page_name(code_page, &encoding_obj);
7233 if (encoding == NULL)
7234 return -1;
7235
7236 if (errors == NULL || strcmp(errors, "strict") == 0) {
7237 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7238 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007239 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 if (exc != NULL) {
7241 PyCodec_StrictErrors(exc);
7242 Py_DECREF(exc);
7243 }
7244 Py_XDECREF(encoding_obj);
7245 return -1;
7246 }
7247
7248 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7249 pusedDefaultChar = &usedDefaultChar;
7250 else
7251 pusedDefaultChar = NULL;
7252
7253 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7254 PyErr_NoMemory();
7255 goto error;
7256 }
7257 outsize = insize * Py_ARRAY_LENGTH(buffer);
7258
7259 if (*outbytes == NULL) {
7260 /* Create string object */
7261 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7262 if (*outbytes == NULL)
7263 goto error;
7264 out = PyBytes_AS_STRING(*outbytes);
7265 }
7266 else {
7267 /* Extend string object */
7268 Py_ssize_t n = PyBytes_Size(*outbytes);
7269 if (n > PY_SSIZE_T_MAX - outsize) {
7270 PyErr_NoMemory();
7271 goto error;
7272 }
7273 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7274 goto error;
7275 out = PyBytes_AS_STRING(*outbytes) + n;
7276 }
7277
7278 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007279 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007280 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007281 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7282 wchar_t chars[2];
7283 int charsize;
7284 if (ch < 0x10000) {
7285 chars[0] = (wchar_t)ch;
7286 charsize = 1;
7287 }
7288 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007289 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7290 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007291 charsize = 2;
7292 }
7293
Victor Stinner3a50e702011-10-18 21:21:00 +02007294 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007295 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007296 buffer, Py_ARRAY_LENGTH(buffer),
7297 NULL, pusedDefaultChar);
7298 if (outsize > 0) {
7299 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7300 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007301 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007302 memcpy(out, buffer, outsize);
7303 out += outsize;
7304 continue;
7305 }
7306 }
7307 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7308 PyErr_SetFromWindowsErr(0);
7309 goto error;
7310 }
7311
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 rep = unicode_encode_call_errorhandler(
7313 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007314 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007315 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007316 if (rep == NULL)
7317 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007318 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007319
7320 if (PyBytes_Check(rep)) {
7321 outsize = PyBytes_GET_SIZE(rep);
7322 if (outsize != 1) {
7323 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7324 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7325 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7326 Py_DECREF(rep);
7327 goto error;
7328 }
7329 out = PyBytes_AS_STRING(*outbytes) + offset;
7330 }
7331 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7332 out += outsize;
7333 }
7334 else {
7335 Py_ssize_t i;
7336 enum PyUnicode_Kind kind;
7337 void *data;
7338
Benjamin Petersonbac79492012-01-14 13:34:47 -05007339 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007340 Py_DECREF(rep);
7341 goto error;
7342 }
7343
7344 outsize = PyUnicode_GET_LENGTH(rep);
7345 if (outsize != 1) {
7346 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7347 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7348 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7349 Py_DECREF(rep);
7350 goto error;
7351 }
7352 out = PyBytes_AS_STRING(*outbytes) + offset;
7353 }
7354 kind = PyUnicode_KIND(rep);
7355 data = PyUnicode_DATA(rep);
7356 for (i=0; i < outsize; i++) {
7357 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7358 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007359 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007360 encoding, unicode,
7361 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007362 "unable to encode error handler result to ASCII");
7363 Py_DECREF(rep);
7364 goto error;
7365 }
7366 *out = (unsigned char)ch;
7367 out++;
7368 }
7369 }
7370 Py_DECREF(rep);
7371 }
7372 /* write a NUL byte */
7373 *out = 0;
7374 outsize = out - PyBytes_AS_STRING(*outbytes);
7375 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7376 if (_PyBytes_Resize(outbytes, outsize) < 0)
7377 goto error;
7378 ret = 0;
7379
7380error:
7381 Py_XDECREF(encoding_obj);
7382 Py_XDECREF(errorHandler);
7383 Py_XDECREF(exc);
7384 return ret;
7385}
7386
Victor Stinner3a50e702011-10-18 21:21:00 +02007387static PyObject *
7388encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007389 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007390 const char *errors)
7391{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007392 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007393 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007394 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007395 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007396
Victor Stinner29dacf22015-01-26 16:41:32 +01007397 if (!PyUnicode_Check(unicode)) {
7398 PyErr_BadArgument();
7399 return NULL;
7400 }
7401
Benjamin Petersonbac79492012-01-14 13:34:47 -05007402 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007403 return NULL;
7404 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007405
Victor Stinner3a50e702011-10-18 21:21:00 +02007406 if (code_page < 0) {
7407 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7408 return NULL;
7409 }
7410
Martin v. Löwis3d325192011-11-04 18:23:06 +01007411 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007412 return PyBytes_FromStringAndSize(NULL, 0);
7413
Victor Stinner7581cef2011-11-03 22:32:33 +01007414 offset = 0;
7415 do
7416 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007417#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007418 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007419 chunks. */
7420 if (len > INT_MAX/2) {
7421 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007422 done = 0;
7423 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007424 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007425#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007426 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007427 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007428 done = 1;
7429 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007430
Victor Stinner76a31a62011-11-04 00:05:13 +01007431 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007432 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007433 errors);
7434 if (ret == -2)
7435 ret = encode_code_page_errors(code_page, &outbytes,
7436 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007437 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007438 if (ret < 0) {
7439 Py_XDECREF(outbytes);
7440 return NULL;
7441 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007442
Victor Stinner7581cef2011-11-03 22:32:33 +01007443 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007444 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007445 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007446
Victor Stinner3a50e702011-10-18 21:21:00 +02007447 return outbytes;
7448}
7449
7450PyObject *
7451PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7452 Py_ssize_t size,
7453 const char *errors)
7454{
Victor Stinner7581cef2011-11-03 22:32:33 +01007455 PyObject *unicode, *res;
7456 unicode = PyUnicode_FromUnicode(p, size);
7457 if (unicode == NULL)
7458 return NULL;
7459 res = encode_code_page(CP_ACP, unicode, errors);
7460 Py_DECREF(unicode);
7461 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007462}
7463
7464PyObject *
7465PyUnicode_EncodeCodePage(int code_page,
7466 PyObject *unicode,
7467 const char *errors)
7468{
Victor Stinner7581cef2011-11-03 22:32:33 +01007469 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007470}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007471
Alexander Belopolsky40018472011-02-26 01:02:56 +00007472PyObject *
7473PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007474{
Victor Stinner7581cef2011-11-03 22:32:33 +01007475 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007476}
7477
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007478#undef NEED_RETRY
7479
Victor Stinner99b95382011-07-04 14:23:54 +02007480#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007481
Guido van Rossumd57fd912000-03-10 22:53:23 +00007482/* --- Character Mapping Codec -------------------------------------------- */
7483
Victor Stinnerfb161b12013-04-18 01:44:27 +02007484static int
7485charmap_decode_string(const char *s,
7486 Py_ssize_t size,
7487 PyObject *mapping,
7488 const char *errors,
7489 _PyUnicodeWriter *writer)
7490{
7491 const char *starts = s;
7492 const char *e;
7493 Py_ssize_t startinpos, endinpos;
7494 PyObject *errorHandler = NULL, *exc = NULL;
7495 Py_ssize_t maplen;
7496 enum PyUnicode_Kind mapkind;
7497 void *mapdata;
7498 Py_UCS4 x;
7499 unsigned char ch;
7500
7501 if (PyUnicode_READY(mapping) == -1)
7502 return -1;
7503
7504 maplen = PyUnicode_GET_LENGTH(mapping);
7505 mapdata = PyUnicode_DATA(mapping);
7506 mapkind = PyUnicode_KIND(mapping);
7507
7508 e = s + size;
7509
7510 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7511 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7512 * is disabled in encoding aliases, latin1 is preferred because
7513 * its implementation is faster. */
7514 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7515 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7516 Py_UCS4 maxchar = writer->maxchar;
7517
7518 assert (writer->kind == PyUnicode_1BYTE_KIND);
7519 while (s < e) {
7520 ch = *s;
7521 x = mapdata_ucs1[ch];
7522 if (x > maxchar) {
7523 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7524 goto onError;
7525 maxchar = writer->maxchar;
7526 outdata = (Py_UCS1 *)writer->data;
7527 }
7528 outdata[writer->pos] = x;
7529 writer->pos++;
7530 ++s;
7531 }
7532 return 0;
7533 }
7534
7535 while (s < e) {
7536 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7537 enum PyUnicode_Kind outkind = writer->kind;
7538 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7539 if (outkind == PyUnicode_1BYTE_KIND) {
7540 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7541 Py_UCS4 maxchar = writer->maxchar;
7542 while (s < e) {
7543 ch = *s;
7544 x = mapdata_ucs2[ch];
7545 if (x > maxchar)
7546 goto Error;
7547 outdata[writer->pos] = x;
7548 writer->pos++;
7549 ++s;
7550 }
7551 break;
7552 }
7553 else if (outkind == PyUnicode_2BYTE_KIND) {
7554 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7555 while (s < e) {
7556 ch = *s;
7557 x = mapdata_ucs2[ch];
7558 if (x == 0xFFFE)
7559 goto Error;
7560 outdata[writer->pos] = x;
7561 writer->pos++;
7562 ++s;
7563 }
7564 break;
7565 }
7566 }
7567 ch = *s;
7568
7569 if (ch < maplen)
7570 x = PyUnicode_READ(mapkind, mapdata, ch);
7571 else
7572 x = 0xfffe; /* invalid value */
7573Error:
7574 if (x == 0xfffe)
7575 {
7576 /* undefined mapping */
7577 startinpos = s-starts;
7578 endinpos = startinpos+1;
7579 if (unicode_decode_call_errorhandler_writer(
7580 errors, &errorHandler,
7581 "charmap", "character maps to <undefined>",
7582 &starts, &e, &startinpos, &endinpos, &exc, &s,
7583 writer)) {
7584 goto onError;
7585 }
7586 continue;
7587 }
7588
7589 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7590 goto onError;
7591 ++s;
7592 }
7593 Py_XDECREF(errorHandler);
7594 Py_XDECREF(exc);
7595 return 0;
7596
7597onError:
7598 Py_XDECREF(errorHandler);
7599 Py_XDECREF(exc);
7600 return -1;
7601}
7602
7603static int
7604charmap_decode_mapping(const char *s,
7605 Py_ssize_t size,
7606 PyObject *mapping,
7607 const char *errors,
7608 _PyUnicodeWriter *writer)
7609{
7610 const char *starts = s;
7611 const char *e;
7612 Py_ssize_t startinpos, endinpos;
7613 PyObject *errorHandler = NULL, *exc = NULL;
7614 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007615 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007616
7617 e = s + size;
7618
7619 while (s < e) {
7620 ch = *s;
7621
7622 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7623 key = PyLong_FromLong((long)ch);
7624 if (key == NULL)
7625 goto onError;
7626
7627 item = PyObject_GetItem(mapping, key);
7628 Py_DECREF(key);
7629 if (item == NULL) {
7630 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7631 /* No mapping found means: mapping is undefined. */
7632 PyErr_Clear();
7633 goto Undefined;
7634 } else
7635 goto onError;
7636 }
7637
7638 /* Apply mapping */
7639 if (item == Py_None)
7640 goto Undefined;
7641 if (PyLong_Check(item)) {
7642 long value = PyLong_AS_LONG(item);
7643 if (value == 0xFFFE)
7644 goto Undefined;
7645 if (value < 0 || value > MAX_UNICODE) {
7646 PyErr_Format(PyExc_TypeError,
7647 "character mapping must be in range(0x%lx)",
7648 (unsigned long)MAX_UNICODE + 1);
7649 goto onError;
7650 }
7651
7652 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7653 goto onError;
7654 }
7655 else if (PyUnicode_Check(item)) {
7656 if (PyUnicode_READY(item) == -1)
7657 goto onError;
7658 if (PyUnicode_GET_LENGTH(item) == 1) {
7659 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7660 if (value == 0xFFFE)
7661 goto Undefined;
7662 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7663 goto onError;
7664 }
7665 else {
7666 writer->overallocate = 1;
7667 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7668 goto onError;
7669 }
7670 }
7671 else {
7672 /* wrong return value */
7673 PyErr_SetString(PyExc_TypeError,
7674 "character mapping must return integer, None or str");
7675 goto onError;
7676 }
7677 Py_CLEAR(item);
7678 ++s;
7679 continue;
7680
7681Undefined:
7682 /* undefined mapping */
7683 Py_CLEAR(item);
7684 startinpos = s-starts;
7685 endinpos = startinpos+1;
7686 if (unicode_decode_call_errorhandler_writer(
7687 errors, &errorHandler,
7688 "charmap", "character maps to <undefined>",
7689 &starts, &e, &startinpos, &endinpos, &exc, &s,
7690 writer)) {
7691 goto onError;
7692 }
7693 }
7694 Py_XDECREF(errorHandler);
7695 Py_XDECREF(exc);
7696 return 0;
7697
7698onError:
7699 Py_XDECREF(item);
7700 Py_XDECREF(errorHandler);
7701 Py_XDECREF(exc);
7702 return -1;
7703}
7704
Alexander Belopolsky40018472011-02-26 01:02:56 +00007705PyObject *
7706PyUnicode_DecodeCharmap(const char *s,
7707 Py_ssize_t size,
7708 PyObject *mapping,
7709 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007711 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007712
Guido van Rossumd57fd912000-03-10 22:53:23 +00007713 /* Default to Latin-1 */
7714 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007715 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716
Guido van Rossumd57fd912000-03-10 22:53:23 +00007717 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007718 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007719 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007720 writer.min_length = size;
7721 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007722 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007723
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007724 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007725 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7726 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007727 }
7728 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007729 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7730 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007731 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007732 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007733
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007735 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736 return NULL;
7737}
7738
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007739/* Charmap encoding: the lookup table */
7740
Alexander Belopolsky40018472011-02-26 01:02:56 +00007741struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 PyObject_HEAD
7743 unsigned char level1[32];
7744 int count2, count3;
7745 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007746};
7747
7748static PyObject*
7749encoding_map_size(PyObject *obj, PyObject* args)
7750{
7751 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007752 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007753 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007754}
7755
7756static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007757 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007758 PyDoc_STR("Return the size (in bytes) of this object") },
7759 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007760};
7761
7762static void
7763encoding_map_dealloc(PyObject* o)
7764{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007765 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007766}
7767
7768static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007769 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 "EncodingMap", /*tp_name*/
7771 sizeof(struct encoding_map), /*tp_basicsize*/
7772 0, /*tp_itemsize*/
7773 /* methods */
7774 encoding_map_dealloc, /*tp_dealloc*/
7775 0, /*tp_print*/
7776 0, /*tp_getattr*/
7777 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007778 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 0, /*tp_repr*/
7780 0, /*tp_as_number*/
7781 0, /*tp_as_sequence*/
7782 0, /*tp_as_mapping*/
7783 0, /*tp_hash*/
7784 0, /*tp_call*/
7785 0, /*tp_str*/
7786 0, /*tp_getattro*/
7787 0, /*tp_setattro*/
7788 0, /*tp_as_buffer*/
7789 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7790 0, /*tp_doc*/
7791 0, /*tp_traverse*/
7792 0, /*tp_clear*/
7793 0, /*tp_richcompare*/
7794 0, /*tp_weaklistoffset*/
7795 0, /*tp_iter*/
7796 0, /*tp_iternext*/
7797 encoding_map_methods, /*tp_methods*/
7798 0, /*tp_members*/
7799 0, /*tp_getset*/
7800 0, /*tp_base*/
7801 0, /*tp_dict*/
7802 0, /*tp_descr_get*/
7803 0, /*tp_descr_set*/
7804 0, /*tp_dictoffset*/
7805 0, /*tp_init*/
7806 0, /*tp_alloc*/
7807 0, /*tp_new*/
7808 0, /*tp_free*/
7809 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007810};
7811
7812PyObject*
7813PyUnicode_BuildEncodingMap(PyObject* string)
7814{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007815 PyObject *result;
7816 struct encoding_map *mresult;
7817 int i;
7818 int need_dict = 0;
7819 unsigned char level1[32];
7820 unsigned char level2[512];
7821 unsigned char *mlevel1, *mlevel2, *mlevel3;
7822 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007823 int kind;
7824 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007825 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007826 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007827
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007828 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007829 PyErr_BadArgument();
7830 return NULL;
7831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007832 kind = PyUnicode_KIND(string);
7833 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007834 length = PyUnicode_GET_LENGTH(string);
7835 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007836 memset(level1, 0xFF, sizeof level1);
7837 memset(level2, 0xFF, sizeof level2);
7838
7839 /* If there isn't a one-to-one mapping of NULL to \0,
7840 or if there are non-BMP characters, we need to use
7841 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007842 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007843 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007844 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007845 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007846 ch = PyUnicode_READ(kind, data, i);
7847 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007848 need_dict = 1;
7849 break;
7850 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007851 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007852 /* unmapped character */
7853 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007854 l1 = ch >> 11;
7855 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007856 if (level1[l1] == 0xFF)
7857 level1[l1] = count2++;
7858 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007859 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007860 }
7861
7862 if (count2 >= 0xFF || count3 >= 0xFF)
7863 need_dict = 1;
7864
7865 if (need_dict) {
7866 PyObject *result = PyDict_New();
7867 PyObject *key, *value;
7868 if (!result)
7869 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007870 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007871 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007872 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007873 if (!key || !value)
7874 goto failed1;
7875 if (PyDict_SetItem(result, key, value) == -1)
7876 goto failed1;
7877 Py_DECREF(key);
7878 Py_DECREF(value);
7879 }
7880 return result;
7881 failed1:
7882 Py_XDECREF(key);
7883 Py_XDECREF(value);
7884 Py_DECREF(result);
7885 return NULL;
7886 }
7887
7888 /* Create a three-level trie */
7889 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7890 16*count2 + 128*count3 - 1);
7891 if (!result)
7892 return PyErr_NoMemory();
7893 PyObject_Init(result, &EncodingMapType);
7894 mresult = (struct encoding_map*)result;
7895 mresult->count2 = count2;
7896 mresult->count3 = count3;
7897 mlevel1 = mresult->level1;
7898 mlevel2 = mresult->level23;
7899 mlevel3 = mresult->level23 + 16*count2;
7900 memcpy(mlevel1, level1, 32);
7901 memset(mlevel2, 0xFF, 16*count2);
7902 memset(mlevel3, 0, 128*count3);
7903 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007904 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007905 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007906 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7907 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007908 /* unmapped character */
7909 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007910 o1 = ch>>11;
7911 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007912 i2 = 16*mlevel1[o1] + o2;
7913 if (mlevel2[i2] == 0xFF)
7914 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007915 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007916 i3 = 128*mlevel2[i2] + o3;
7917 mlevel3[i3] = i;
7918 }
7919 return result;
7920}
7921
7922static int
Victor Stinner22168992011-11-20 17:09:18 +01007923encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007924{
7925 struct encoding_map *map = (struct encoding_map*)mapping;
7926 int l1 = c>>11;
7927 int l2 = (c>>7) & 0xF;
7928 int l3 = c & 0x7F;
7929 int i;
7930
Victor Stinner22168992011-11-20 17:09:18 +01007931 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007932 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007933 if (c == 0)
7934 return 0;
7935 /* level 1*/
7936 i = map->level1[l1];
7937 if (i == 0xFF) {
7938 return -1;
7939 }
7940 /* level 2*/
7941 i = map->level23[16*i+l2];
7942 if (i == 0xFF) {
7943 return -1;
7944 }
7945 /* level 3 */
7946 i = map->level23[16*map->count2 + 128*i + l3];
7947 if (i == 0) {
7948 return -1;
7949 }
7950 return i;
7951}
7952
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007953/* Lookup the character ch in the mapping. If the character
7954 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007955 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007956static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007957charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007958{
Christian Heimes217cfd12007-12-02 14:31:20 +00007959 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007960 PyObject *x;
7961
7962 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007964 x = PyObject_GetItem(mapping, w);
7965 Py_DECREF(w);
7966 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7968 /* No mapping found means: mapping is undefined. */
7969 PyErr_Clear();
7970 x = Py_None;
7971 Py_INCREF(x);
7972 return x;
7973 } else
7974 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007976 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007978 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 long value = PyLong_AS_LONG(x);
7980 if (value < 0 || value > 255) {
7981 PyErr_SetString(PyExc_TypeError,
7982 "character mapping must be in range(256)");
7983 Py_DECREF(x);
7984 return NULL;
7985 }
7986 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007988 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007990 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 /* wrong return value */
7992 PyErr_Format(PyExc_TypeError,
7993 "character mapping must return integer, bytes or None, not %.400s",
7994 x->ob_type->tp_name);
7995 Py_DECREF(x);
7996 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997 }
7998}
7999
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008000static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008001charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008002{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008003 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8004 /* exponentially overallocate to minimize reallocations */
8005 if (requiredsize < 2*outsize)
8006 requiredsize = 2*outsize;
8007 if (_PyBytes_Resize(outobj, requiredsize))
8008 return -1;
8009 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008010}
8011
/* Outcome of trying to encode one character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008015/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008016 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008017 space is available. Return a new reference to the object that
8018 was put in the output buffer, or Py_None, if the mapping was undefined
8019 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008020 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008021static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008022charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008023 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008024{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008025 PyObject *rep;
8026 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008027 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008028
Christian Heimes90aa7642007-12-19 02:45:37 +00008029 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008030 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008032 if (res == -1)
8033 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 if (outsize<requiredsize)
8035 if (charmapencode_resize(outobj, outpos, requiredsize))
8036 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008037 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 outstart[(*outpos)++] = (char)res;
8039 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008040 }
8041
8042 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008043 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008045 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008046 Py_DECREF(rep);
8047 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008048 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008049 if (PyLong_Check(rep)) {
8050 Py_ssize_t requiredsize = *outpos+1;
8051 if (outsize<requiredsize)
8052 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8053 Py_DECREF(rep);
8054 return enc_EXCEPTION;
8055 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008056 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008058 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 else {
8060 const char *repchars = PyBytes_AS_STRING(rep);
8061 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8062 Py_ssize_t requiredsize = *outpos+repsize;
8063 if (outsize<requiredsize)
8064 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8065 Py_DECREF(rep);
8066 return enc_EXCEPTION;
8067 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008068 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 memcpy(outstart + *outpos, repchars, repsize);
8070 *outpos += repsize;
8071 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008072 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008073 Py_DECREF(rep);
8074 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008075}
8076
8077/* handle an error in PyUnicode_EncodeCharmap
8078 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008079static int
8080charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008081 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008082 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008083 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008084 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008085{
8086 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008087 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008088 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008089 enum PyUnicode_Kind kind;
8090 void *data;
8091 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008092 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008093 Py_ssize_t collstartpos = *inpos;
8094 Py_ssize_t collendpos = *inpos+1;
8095 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008096 char *encoding = "charmap";
8097 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008098 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008099 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008100 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008101
Benjamin Petersonbac79492012-01-14 13:34:47 -05008102 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008103 return -1;
8104 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008105 /* find all unencodable characters */
8106 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008107 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008108 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008109 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008110 val = encoding_map_lookup(ch, mapping);
8111 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 break;
8113 ++collendpos;
8114 continue;
8115 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008116
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008117 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8118 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 if (rep==NULL)
8120 return -1;
8121 else if (rep!=Py_None) {
8122 Py_DECREF(rep);
8123 break;
8124 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008125 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008126 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008127 }
8128 /* cache callback name lookup
8129 * (if not done yet, i.e. it's the first error) */
8130 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 if ((errors==NULL) || (!strcmp(errors, "strict")))
8132 *known_errorHandler = 1;
8133 else if (!strcmp(errors, "replace"))
8134 *known_errorHandler = 2;
8135 else if (!strcmp(errors, "ignore"))
8136 *known_errorHandler = 3;
8137 else if (!strcmp(errors, "xmlcharrefreplace"))
8138 *known_errorHandler = 4;
8139 else
8140 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008141 }
8142 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008143 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008144 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008145 return -1;
8146 case 2: /* replace */
8147 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008148 x = charmapencode_output('?', mapping, res, respos);
8149 if (x==enc_EXCEPTION) {
8150 return -1;
8151 }
8152 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008153 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008154 return -1;
8155 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008156 }
8157 /* fall through */
8158 case 3: /* ignore */
8159 *inpos = collendpos;
8160 break;
8161 case 4: /* xmlcharrefreplace */
8162 /* generate replacement (temporarily (mis)uses p) */
8163 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008164 char buffer[2+29+1+1];
8165 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008166 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 for (cp = buffer; *cp; ++cp) {
8168 x = charmapencode_output(*cp, mapping, res, respos);
8169 if (x==enc_EXCEPTION)
8170 return -1;
8171 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008172 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 return -1;
8174 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008175 }
8176 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008177 *inpos = collendpos;
8178 break;
8179 default:
8180 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008181 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008183 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008185 if (PyBytes_Check(repunicode)) {
8186 /* Directly copy bytes result to output. */
8187 Py_ssize_t outsize = PyBytes_Size(*res);
8188 Py_ssize_t requiredsize;
8189 repsize = PyBytes_Size(repunicode);
8190 requiredsize = *respos + repsize;
8191 if (requiredsize > outsize)
8192 /* Make room for all additional bytes. */
8193 if (charmapencode_resize(res, respos, requiredsize)) {
8194 Py_DECREF(repunicode);
8195 return -1;
8196 }
8197 memcpy(PyBytes_AsString(*res) + *respos,
8198 PyBytes_AsString(repunicode), repsize);
8199 *respos += repsize;
8200 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008201 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008202 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008203 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008204 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008205 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008206 Py_DECREF(repunicode);
8207 return -1;
8208 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008209 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008210 data = PyUnicode_DATA(repunicode);
8211 kind = PyUnicode_KIND(repunicode);
8212 for (index = 0; index < repsize; index++) {
8213 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8214 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008216 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 return -1;
8218 }
8219 else if (x==enc_FAILED) {
8220 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008221 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008222 return -1;
8223 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008224 }
8225 *inpos = newpos;
8226 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008227 }
8228 return 0;
8229}
8230
Alexander Belopolsky40018472011-02-26 01:02:56 +00008231PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008232_PyUnicode_EncodeCharmap(PyObject *unicode,
8233 PyObject *mapping,
8234 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008235{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008236 /* output object */
8237 PyObject *res = NULL;
8238 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008239 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008240 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008241 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008242 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243 PyObject *errorHandler = NULL;
8244 PyObject *exc = NULL;
8245 /* the following variable is used for caching string comparisons
8246 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8247 * 3=ignore, 4=xmlcharrefreplace */
8248 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008249 void *data;
8250 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251
Benjamin Petersonbac79492012-01-14 13:34:47 -05008252 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008253 return NULL;
8254 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008255 data = PyUnicode_DATA(unicode);
8256 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008257
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258 /* Default to Latin-1 */
8259 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008260 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262 /* allocate enough for a simple encoding without
8263 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008264 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008265 if (res == NULL)
8266 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008267 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008269
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008271 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008273 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 if (x==enc_EXCEPTION) /* error */
8275 goto onError;
8276 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008277 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 &exc,
8279 &known_errorHandler, &errorHandler, errors,
8280 &res, &respos)) {
8281 goto onError;
8282 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008283 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 else
8285 /* done with this character => adjust input position */
8286 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008289 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008290 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008291 if (_PyBytes_Resize(&res, respos) < 0)
8292 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008293
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294 Py_XDECREF(exc);
8295 Py_XDECREF(errorHandler);
8296 return res;
8297
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008299 Py_XDECREF(res);
8300 Py_XDECREF(exc);
8301 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008302 return NULL;
8303}
8304
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008305/* Deprecated */
8306PyObject *
8307PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8308 Py_ssize_t size,
8309 PyObject *mapping,
8310 const char *errors)
8311{
8312 PyObject *result;
8313 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8314 if (unicode == NULL)
8315 return NULL;
8316 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8317 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008318 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008319}
8320
Alexander Belopolsky40018472011-02-26 01:02:56 +00008321PyObject *
8322PyUnicode_AsCharmapString(PyObject *unicode,
8323 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324{
8325 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 PyErr_BadArgument();
8327 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008329 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330}
8331
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008332/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008333static void
8334make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008335 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008336 Py_ssize_t startpos, Py_ssize_t endpos,
8337 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008339 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008340 *exceptionObject = _PyUnicodeTranslateError_Create(
8341 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342 }
8343 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8345 goto onError;
8346 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8347 goto onError;
8348 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8349 goto onError;
8350 return;
8351 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008352 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353 }
8354}
8355
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008356/* error handling callback helper:
8357 build arguments, call the callback and check the arguments,
8358 put the result into newpos and return the replacement string, which
8359 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008360static PyObject *
8361unicode_translate_call_errorhandler(const char *errors,
8362 PyObject **errorHandler,
8363 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008364 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008365 Py_ssize_t startpos, Py_ssize_t endpos,
8366 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008368 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008370 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008371 PyObject *restuple;
8372 PyObject *resunicode;
8373
8374 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008378 }
8379
8380 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008384
8385 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008390 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 Py_DECREF(restuple);
8392 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008393 }
8394 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 &resunicode, &i_newpos)) {
8396 Py_DECREF(restuple);
8397 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008398 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008399 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008400 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008401 else
8402 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008403 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008404 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 Py_DECREF(restuple);
8406 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008407 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408 Py_INCREF(resunicode);
8409 Py_DECREF(restuple);
8410 return resunicode;
8411}
8412
8413/* Lookup the character ch in the mapping and put the result in result,
8414 which must be decrefed by the caller.
8415 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008416static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008417charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418{
Christian Heimes217cfd12007-12-02 14:31:20 +00008419 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008420 PyObject *x;
8421
8422 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424 x = PyObject_GetItem(mapping, w);
8425 Py_DECREF(w);
8426 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008427 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8428 /* No mapping found means: use 1:1 mapping. */
8429 PyErr_Clear();
8430 *result = NULL;
8431 return 0;
8432 } else
8433 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434 }
8435 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 *result = x;
8437 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008438 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008439 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008441 if (value < 0 || value > MAX_UNICODE) {
8442 PyErr_Format(PyExc_ValueError,
8443 "character mapping must be in range(0x%x)",
8444 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 Py_DECREF(x);
8446 return -1;
8447 }
8448 *result = x;
8449 return 0;
8450 }
8451 else if (PyUnicode_Check(x)) {
8452 *result = x;
8453 return 0;
8454 }
8455 else {
8456 /* wrong return value */
8457 PyErr_SetString(PyExc_TypeError,
8458 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008459 Py_DECREF(x);
8460 return -1;
8461 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008462}
Victor Stinner1194ea02014-04-04 19:37:40 +02008463
8464/* lookup the character, write the result into the writer.
8465 Return 1 if the result was written into the writer, return 0 if the mapping
8466 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008467static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008468charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8469 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008470{
Victor Stinner1194ea02014-04-04 19:37:40 +02008471 PyObject *item;
8472
8473 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008475
8476 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008478 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008479 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008481 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008482 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008483
8484 if (item == Py_None) {
8485 Py_DECREF(item);
8486 return 0;
8487 }
8488
8489 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008490 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8491 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8492 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008493 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8494 Py_DECREF(item);
8495 return -1;
8496 }
8497 Py_DECREF(item);
8498 return 1;
8499 }
8500
8501 if (!PyUnicode_Check(item)) {
8502 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008504 }
8505
8506 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8507 Py_DECREF(item);
8508 return -1;
8509 }
8510
8511 Py_DECREF(item);
8512 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008513}
8514
Victor Stinner89a76ab2014-04-05 11:44:04 +02008515static int
8516unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8517 Py_UCS1 *translate)
8518{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008519 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008520 int ret = 0;
8521
Victor Stinner89a76ab2014-04-05 11:44:04 +02008522 if (charmaptranslate_lookup(ch, mapping, &item)) {
8523 return -1;
8524 }
8525
8526 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008527 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008528 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008529 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008530 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008531 /* not found => default to 1:1 mapping */
8532 translate[ch] = ch;
8533 return 1;
8534 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008535 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008536 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008537 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8538 used it */
8539 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008540 /* invalid character or character outside ASCII:
8541 skip the fast translate */
8542 goto exit;
8543 }
8544 translate[ch] = (Py_UCS1)replace;
8545 }
8546 else if (PyUnicode_Check(item)) {
8547 Py_UCS4 replace;
8548
8549 if (PyUnicode_READY(item) == -1) {
8550 Py_DECREF(item);
8551 return -1;
8552 }
8553 if (PyUnicode_GET_LENGTH(item) != 1)
8554 goto exit;
8555
8556 replace = PyUnicode_READ_CHAR(item, 0);
8557 if (replace > 127)
8558 goto exit;
8559 translate[ch] = (Py_UCS1)replace;
8560 }
8561 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008562 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008563 goto exit;
8564 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008565 ret = 1;
8566
Benjamin Peterson1365de72014-04-07 20:15:41 -04008567 exit:
8568 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008569 return ret;
8570}
8571
8572/* Fast path for ascii => ascii translation. Return 1 if the whole string
8573 was translated into writer, return 0 if the input string was partially
8574 translated into writer, raise an exception and return -1 on error. */
8575static int
8576unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008577 _PyUnicodeWriter *writer, int ignore,
8578 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008579{
Victor Stinner872b2912014-04-05 14:27:07 +02008580 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008581 Py_ssize_t len;
8582 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008583 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008584
Victor Stinner89a76ab2014-04-05 11:44:04 +02008585 len = PyUnicode_GET_LENGTH(input);
8586
Victor Stinner872b2912014-04-05 14:27:07 +02008587 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008588
8589 in = PyUnicode_1BYTE_DATA(input);
8590 end = in + len;
8591
8592 assert(PyUnicode_IS_ASCII(writer->buffer));
8593 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8594 out = PyUnicode_1BYTE_DATA(writer->buffer);
8595
Victor Stinner872b2912014-04-05 14:27:07 +02008596 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008597 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008598 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008599 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008600 int translate = unicode_fast_translate_lookup(mapping, ch,
8601 ascii_table);
8602 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008603 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008604 if (translate == 0)
8605 goto exit;
8606 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008607 }
Victor Stinner872b2912014-04-05 14:27:07 +02008608 if (ch2 == 0xfe) {
8609 if (ignore)
8610 continue;
8611 goto exit;
8612 }
8613 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008614 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008615 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008616 }
Victor Stinner872b2912014-04-05 14:27:07 +02008617 res = 1;
8618
8619exit:
8620 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008621 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008622 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008623}
8624
Alexander Belopolsky40018472011-02-26 01:02:56 +00008625PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626_PyUnicode_TranslateCharmap(PyObject *input,
8627 PyObject *mapping,
8628 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008631 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 Py_ssize_t size, i;
8633 int kind;
8634 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008635 _PyUnicodeWriter writer;
8636 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008637 char *reason = "character maps to <undefined>";
8638 PyObject *errorHandler = NULL;
8639 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008640 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008641 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008642
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 PyErr_BadArgument();
8645 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 if (PyUnicode_READY(input) == -1)
8649 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008650 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008651 kind = PyUnicode_KIND(input);
8652 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653
8654 if (size == 0) {
8655 Py_INCREF(input);
8656 return input;
8657 }
8658
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008659 /* allocate enough for a simple 1:1 translation without
8660 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008661 _PyUnicodeWriter_Init(&writer);
8662 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664
Victor Stinner872b2912014-04-05 14:27:07 +02008665 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8666
Victor Stinner33798672016-03-01 21:59:58 +01008667 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008668 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008669 if (PyUnicode_IS_ASCII(input)) {
8670 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8671 if (res < 0) {
8672 _PyUnicodeWriter_Dealloc(&writer);
8673 return NULL;
8674 }
8675 if (res == 1)
8676 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008677 }
Victor Stinner33798672016-03-01 21:59:58 +01008678 else {
8679 i = 0;
8680 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008682 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008684 int translate;
8685 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8686 Py_ssize_t newpos;
8687 /* startpos for collecting untranslatable chars */
8688 Py_ssize_t collstart;
8689 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008690 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008691
Victor Stinner1194ea02014-04-04 19:37:40 +02008692 ch = PyUnicode_READ(kind, data, i);
8693 translate = charmaptranslate_output(ch, mapping, &writer);
8694 if (translate < 0)
8695 goto onError;
8696
8697 if (translate != 0) {
8698 /* it worked => adjust input pointer */
8699 ++i;
8700 continue;
8701 }
8702
8703 /* untranslatable character */
8704 collstart = i;
8705 collend = i+1;
8706
8707 /* find all untranslatable characters */
8708 while (collend < size) {
8709 PyObject *x;
8710 ch = PyUnicode_READ(kind, data, collend);
8711 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008712 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008713 Py_XDECREF(x);
8714 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008716 ++collend;
8717 }
8718
8719 if (ignore) {
8720 i = collend;
8721 }
8722 else {
8723 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8724 reason, input, &exc,
8725 collstart, collend, &newpos);
8726 if (repunicode == NULL)
8727 goto onError;
8728 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008730 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008731 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008732 Py_DECREF(repunicode);
8733 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008734 }
8735 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008736 Py_XDECREF(exc);
8737 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008738 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008741 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008742 Py_XDECREF(exc);
8743 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744 return NULL;
8745}
8746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008747/* Deprecated. Use PyUnicode_Translate instead. */
8748PyObject *
8749PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8750 Py_ssize_t size,
8751 PyObject *mapping,
8752 const char *errors)
8753{
Christian Heimes5f520f42012-09-11 14:03:25 +02008754 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8756 if (!unicode)
8757 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008758 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8759 Py_DECREF(unicode);
8760 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008761}
8762
Alexander Belopolsky40018472011-02-26 01:02:56 +00008763PyObject *
8764PyUnicode_Translate(PyObject *str,
8765 PyObject *mapping,
8766 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008767{
8768 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008769
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770 str = PyUnicode_FromObject(str);
8771 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008772 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774 Py_DECREF(str);
8775 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776}
Tim Petersced69f82003-09-16 20:30:58 +00008777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008779fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780{
8781 /* No need to call PyUnicode_READY(self) because this function is only
8782 called as a callback from fixup() which does it already. */
8783 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8784 const int kind = PyUnicode_KIND(self);
8785 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008786 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008787 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788 Py_ssize_t i;
8789
8790 for (i = 0; i < len; ++i) {
8791 ch = PyUnicode_READ(kind, data, i);
8792 fixed = 0;
8793 if (ch > 127) {
8794 if (Py_UNICODE_ISSPACE(ch))
8795 fixed = ' ';
8796 else {
8797 const int decimal = Py_UNICODE_TODECIMAL(ch);
8798 if (decimal >= 0)
8799 fixed = '0' + decimal;
8800 }
8801 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008802 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008803 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 PyUnicode_WRITE(kind, data, i, fixed);
8805 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008806 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008807 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008808 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008809 }
8810
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008811 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008812}
8813
8814PyObject *
8815_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8816{
8817 if (!PyUnicode_Check(unicode)) {
8818 PyErr_BadInternalCall();
8819 return NULL;
8820 }
8821 if (PyUnicode_READY(unicode) == -1)
8822 return NULL;
8823 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8824 /* If the string is already ASCII, just return the same string */
8825 Py_INCREF(unicode);
8826 return unicode;
8827 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008828 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008829}
8830
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008831PyObject *
8832PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8833 Py_ssize_t length)
8834{
Victor Stinnerf0124502011-11-21 23:12:56 +01008835 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008836 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008837 Py_UCS4 maxchar;
8838 enum PyUnicode_Kind kind;
8839 void *data;
8840
Victor Stinner99d7ad02012-02-22 13:37:39 +01008841 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008842 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008843 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008844 if (ch > 127) {
8845 int decimal = Py_UNICODE_TODECIMAL(ch);
8846 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008847 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008848 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008849 }
8850 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008851
8852 /* Copy to a new string */
8853 decimal = PyUnicode_New(length, maxchar);
8854 if (decimal == NULL)
8855 return decimal;
8856 kind = PyUnicode_KIND(decimal);
8857 data = PyUnicode_DATA(decimal);
8858 /* Iterate over code points */
8859 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008860 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01008861 if (ch > 127) {
8862 int decimal = Py_UNICODE_TODECIMAL(ch);
8863 if (decimal >= 0)
8864 ch = '0' + decimal;
8865 }
8866 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008867 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008868 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008869}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008870/* --- Decimal Encoder ---------------------------------------------------- */
8871
Alexander Belopolsky40018472011-02-26 01:02:56 +00008872int
8873PyUnicode_EncodeDecimal(Py_UNICODE *s,
8874 Py_ssize_t length,
8875 char *output,
8876 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008877{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008878 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008879 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008880 enum PyUnicode_Kind kind;
8881 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008882
8883 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 PyErr_BadArgument();
8885 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008886 }
8887
Victor Stinner42bf7752011-11-21 22:52:58 +01008888 unicode = PyUnicode_FromUnicode(s, length);
8889 if (unicode == NULL)
8890 return -1;
8891
Benjamin Petersonbac79492012-01-14 13:34:47 -05008892 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008893 Py_DECREF(unicode);
8894 return -1;
8895 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008896 kind = PyUnicode_KIND(unicode);
8897 data = PyUnicode_DATA(unicode);
8898
Victor Stinnerb84d7232011-11-22 01:50:07 +01008899 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008900 PyObject *exc;
8901 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008902 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008903 Py_ssize_t startpos;
8904
8905 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008906
Benjamin Peterson29060642009-01-31 22:14:21 +00008907 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008908 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008909 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008911 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 decimal = Py_UNICODE_TODECIMAL(ch);
8913 if (decimal >= 0) {
8914 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008915 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008916 continue;
8917 }
8918 if (0 < ch && ch < 256) {
8919 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008920 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008921 continue;
8922 }
Victor Stinner6345be92011-11-25 20:09:01 +01008923
Victor Stinner42bf7752011-11-21 22:52:58 +01008924 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008925 exc = NULL;
8926 raise_encode_exception(&exc, "decimal", unicode,
8927 startpos, startpos+1,
8928 "invalid decimal Unicode string");
8929 Py_XDECREF(exc);
8930 Py_DECREF(unicode);
8931 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008932 }
8933 /* 0-terminate the output string */
8934 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008935 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008936 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008937}
8938
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939/* --- Helpers ------------------------------------------------------------ */
8940
/* helper macro to fixup start/end slice values:
   - end greater than len is clamped to len;
   - negative start/end have len added and are then clamped at 0.
   start is not clamped against len.

   start and end must be modifiable lvalues; every argument may be evaluated
   more than once.  The body is wrapped in do { } while (0) so that the macro
   followed by ';' is a single statement, also inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
8955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008957any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 Py_ssize_t start,
8959 Py_ssize_t end)
8960{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008961 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962 void *buf1, *buf2;
8963 Py_ssize_t len1, len2, result;
8964
8965 kind1 = PyUnicode_KIND(s1);
8966 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008967 if (kind1 < kind2)
8968 return -1;
8969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970 len1 = PyUnicode_GET_LENGTH(s1);
8971 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008972 ADJUST_INDICES(start, end, len1);
8973 if (end - start < len2)
8974 return -1;
8975
8976 buf1 = PyUnicode_DATA(s1);
8977 buf2 = PyUnicode_DATA(s2);
8978 if (len2 == 1) {
8979 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
8980 result = findchar((const char *)buf1 + kind1*start,
8981 kind1, end - start, ch, direction);
8982 if (result == -1)
8983 return -1;
8984 else
8985 return start + result;
8986 }
8987
8988 if (kind2 != kind1) {
8989 buf2 = _PyUnicode_AsKind(s2, kind1);
8990 if (!buf2)
8991 return -2;
8992 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008993
Victor Stinner794d5672011-10-10 03:21:36 +02008994 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008995 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02008996 case PyUnicode_1BYTE_KIND:
8997 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8998 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8999 else
9000 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9001 break;
9002 case PyUnicode_2BYTE_KIND:
9003 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9004 break;
9005 case PyUnicode_4BYTE_KIND:
9006 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9007 break;
9008 default:
9009 assert(0); result = -2;
9010 }
9011 }
9012 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009013 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009014 case PyUnicode_1BYTE_KIND:
9015 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9016 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9017 else
9018 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9019 break;
9020 case PyUnicode_2BYTE_KIND:
9021 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9022 break;
9023 case PyUnicode_4BYTE_KIND:
9024 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9025 break;
9026 default:
9027 assert(0); result = -2;
9028 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009029 }
9030
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009031 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032 PyMem_Free(buf2);
9033
9034 return result;
9035}
9036
9037Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009038_PyUnicode_InsertThousandsGrouping(
9039 PyObject *unicode, Py_ssize_t index,
9040 Py_ssize_t n_buffer,
9041 void *digits, Py_ssize_t n_digits,
9042 Py_ssize_t min_width,
9043 const char *grouping, PyObject *thousands_sep,
9044 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045{
Victor Stinner41a863c2012-02-24 00:37:51 +01009046 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009047 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009048 Py_ssize_t thousands_sep_len;
9049 Py_ssize_t len;
9050
9051 if (unicode != NULL) {
9052 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009053 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009054 }
9055 else {
9056 kind = PyUnicode_1BYTE_KIND;
9057 data = NULL;
9058 }
9059 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9060 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9061 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9062 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009063 if (thousands_sep_kind < kind) {
9064 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9065 if (!thousands_sep_data)
9066 return -1;
9067 }
9068 else {
9069 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9070 if (!data)
9071 return -1;
9072 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009073 }
9074
Benjamin Petersonead6b532011-12-20 17:23:42 -06009075 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009077 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009078 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009079 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009080 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009081 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009082 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009083 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009084 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009085 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009086 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009087 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009089 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009090 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009091 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009092 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009093 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009094 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009095 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009096 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009097 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009098 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009099 break;
9100 default:
9101 assert(0);
9102 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009103 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009104 if (unicode != NULL && thousands_sep_kind != kind) {
9105 if (thousands_sep_kind < kind)
9106 PyMem_Free(thousands_sep_data);
9107 else
9108 PyMem_Free(data);
9109 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009110 if (unicode == NULL) {
9111 *maxchar = 127;
9112 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009113 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009114 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009115 }
9116 }
9117 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009118}
9119
9120
Alexander Belopolsky40018472011-02-26 01:02:56 +00009121Py_ssize_t
9122PyUnicode_Count(PyObject *str,
9123 PyObject *substr,
9124 Py_ssize_t start,
9125 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009126{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009127 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009128 PyObject* str_obj;
9129 PyObject* sub_obj;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009130 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009131 void *buf1 = NULL, *buf2 = NULL;
9132 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009133
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009134 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009135 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009136 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009137 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009138 if (!sub_obj) {
9139 Py_DECREF(str_obj);
9140 return -1;
9141 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009142 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009143 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009144 Py_DECREF(str_obj);
9145 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146 }
Tim Petersced69f82003-09-16 20:30:58 +00009147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009148 kind1 = PyUnicode_KIND(str_obj);
9149 kind2 = PyUnicode_KIND(sub_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009150 if (kind1 < kind2) {
9151 Py_DECREF(sub_obj);
9152 Py_DECREF(str_obj);
9153 return 0;
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009154 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009156 len1 = PyUnicode_GET_LENGTH(str_obj);
9157 len2 = PyUnicode_GET_LENGTH(sub_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009159 if (end - start < len2) {
9160 Py_DECREF(sub_obj);
9161 Py_DECREF(str_obj);
9162 return 0;
9163 }
9164
9165 buf1 = PyUnicode_DATA(str_obj);
9166 buf2 = PyUnicode_DATA(sub_obj);
9167 if (kind2 != kind1) {
9168 buf2 = _PyUnicode_AsKind(sub_obj, kind1);
9169 if (!buf2)
9170 goto onError;
9171 }
9172
9173 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009175 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9176 result = asciilib_count(
9177 ((Py_UCS1*)buf1) + start, end - start,
9178 buf2, len2, PY_SSIZE_T_MAX
9179 );
9180 else
9181 result = ucs1lib_count(
9182 ((Py_UCS1*)buf1) + start, end - start,
9183 buf2, len2, PY_SSIZE_T_MAX
9184 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009185 break;
9186 case PyUnicode_2BYTE_KIND:
9187 result = ucs2lib_count(
9188 ((Py_UCS2*)buf1) + start, end - start,
9189 buf2, len2, PY_SSIZE_T_MAX
9190 );
9191 break;
9192 case PyUnicode_4BYTE_KIND:
9193 result = ucs4lib_count(
9194 ((Py_UCS4*)buf1) + start, end - start,
9195 buf2, len2, PY_SSIZE_T_MAX
9196 );
9197 break;
9198 default:
9199 assert(0); result = 0;
9200 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009201
9202 Py_DECREF(sub_obj);
9203 Py_DECREF(str_obj);
9204
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009205 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009206 PyMem_Free(buf2);
9207
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009209 onError:
9210 Py_DECREF(sub_obj);
9211 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009212 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009213 PyMem_Free(buf2);
9214 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215}
9216
Alexander Belopolsky40018472011-02-26 01:02:56 +00009217Py_ssize_t
9218PyUnicode_Find(PyObject *str,
9219 PyObject *sub,
9220 Py_ssize_t start,
9221 Py_ssize_t end,
9222 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009223{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009224 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009225
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009227 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009228 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009229 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009230 if (!sub) {
9231 Py_DECREF(str);
9232 return -2;
9233 }
9234 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9235 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009236 Py_DECREF(str);
9237 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238 }
Tim Petersced69f82003-09-16 20:30:58 +00009239
Victor Stinner794d5672011-10-10 03:21:36 +02009240 result = any_find_slice(direction,
9241 str, sub, start, end
9242 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009243
Guido van Rossumd57fd912000-03-10 22:53:23 +00009244 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009245 Py_DECREF(sub);
9246
Guido van Rossumd57fd912000-03-10 22:53:23 +00009247 return result;
9248}
9249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009250Py_ssize_t
9251PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9252 Py_ssize_t start, Py_ssize_t end,
9253 int direction)
9254{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009255 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009256 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257 if (PyUnicode_READY(str) == -1)
9258 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009259 if (start < 0 || end < 0) {
9260 PyErr_SetString(PyExc_IndexError, "string index out of range");
9261 return -2;
9262 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 if (end > PyUnicode_GET_LENGTH(str))
9264 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009265 if (start >= end)
9266 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009268 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9269 kind, end-start, ch, direction);
9270 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009272 else
9273 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274}
9275
Alexander Belopolsky40018472011-02-26 01:02:56 +00009276static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009277tailmatch(PyObject *self,
9278 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009279 Py_ssize_t start,
9280 Py_ssize_t end,
9281 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009282{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283 int kind_self;
9284 int kind_sub;
9285 void *data_self;
9286 void *data_sub;
9287 Py_ssize_t offset;
9288 Py_ssize_t i;
9289 Py_ssize_t end_sub;
9290
9291 if (PyUnicode_READY(self) == -1 ||
9292 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009293 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9296 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009297 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009298 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009299
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009300 if (PyUnicode_GET_LENGTH(substring) == 0)
9301 return 1;
9302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009303 kind_self = PyUnicode_KIND(self);
9304 data_self = PyUnicode_DATA(self);
9305 kind_sub = PyUnicode_KIND(substring);
9306 data_sub = PyUnicode_DATA(substring);
9307 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9308
9309 if (direction > 0)
9310 offset = end;
9311 else
9312 offset = start;
9313
9314 if (PyUnicode_READ(kind_self, data_self, offset) ==
9315 PyUnicode_READ(kind_sub, data_sub, 0) &&
9316 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9317 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9318 /* If both are of the same kind, memcmp is sufficient */
9319 if (kind_self == kind_sub) {
9320 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009321 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322 data_sub,
9323 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009324 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009325 }
9326 /* otherwise we have to compare each character by first accesing it */
9327 else {
9328 /* We do not need to compare 0 and len(substring)-1 because
9329 the if statement above ensured already that they are equal
9330 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331 for (i = 1; i < end_sub; ++i) {
9332 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9333 PyUnicode_READ(kind_sub, data_sub, i))
9334 return 0;
9335 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009336 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009337 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009338 }
9339
9340 return 0;
9341}
9342
Alexander Belopolsky40018472011-02-26 01:02:56 +00009343Py_ssize_t
9344PyUnicode_Tailmatch(PyObject *str,
9345 PyObject *substr,
9346 Py_ssize_t start,
9347 Py_ssize_t end,
9348 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009350 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009351
Guido van Rossumd57fd912000-03-10 22:53:23 +00009352 str = PyUnicode_FromObject(str);
9353 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009354 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009355 substr = PyUnicode_FromObject(substr);
9356 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009357 Py_DECREF(str);
9358 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359 }
Tim Petersced69f82003-09-16 20:30:58 +00009360
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009361 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009362 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363 Py_DECREF(str);
9364 Py_DECREF(substr);
9365 return result;
9366}
9367
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368/* Apply fixfct filter to the Unicode object self and return a
9369 reference to the modified object */
9370
Alexander Belopolsky40018472011-02-26 01:02:56 +00009371static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009372fixup(PyObject *self,
9373 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009374{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009375 PyObject *u;
9376 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009377 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009379 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009380 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009381 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009382 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384 /* fix functions return the new maximum character in a string,
9385 if the kind of the resulting unicode object does not change,
9386 everything is fine. Otherwise we need to change the string kind
9387 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009388 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009389
9390 if (maxchar_new == 0) {
9391 /* no changes */;
9392 if (PyUnicode_CheckExact(self)) {
9393 Py_DECREF(u);
9394 Py_INCREF(self);
9395 return self;
9396 }
9397 else
9398 return u;
9399 }
9400
Victor Stinnere6abb482012-05-02 01:15:40 +02009401 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402
Victor Stinnereaab6042011-12-11 22:22:39 +01009403 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009405
9406 /* In case the maximum character changed, we need to
9407 convert the string to the new category. */
9408 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9409 if (v == NULL) {
9410 Py_DECREF(u);
9411 return NULL;
9412 }
9413 if (maxchar_new > maxchar_old) {
9414 /* If the maxchar increased so that the kind changed, not all
9415 characters are representable anymore and we need to fix the
9416 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009417 _PyUnicode_FastCopyCharacters(v, 0,
9418 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009419 maxchar_old = fixfct(v);
9420 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009421 }
9422 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009423 _PyUnicode_FastCopyCharacters(v, 0,
9424 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009426 Py_DECREF(u);
9427 assert(_PyUnicode_CheckConsistency(v, 1));
9428 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009429}
9430
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009431static PyObject *
9432ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009433{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009434 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9435 char *resdata, *data = PyUnicode_DATA(self);
9436 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009437
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009438 res = PyUnicode_New(len, 127);
9439 if (res == NULL)
9440 return NULL;
9441 resdata = PyUnicode_DATA(res);
9442 if (lower)
9443 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009444 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009445 _Py_bytes_upper(resdata, data, len);
9446 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447}
9448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009450handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009451{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009452 Py_ssize_t j;
9453 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009454 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009455 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009456
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009457 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9458
9459 where ! is a negation and \p{xxx} is a character with property xxx.
9460 */
9461 for (j = i - 1; j >= 0; j--) {
9462 c = PyUnicode_READ(kind, data, j);
9463 if (!_PyUnicode_IsCaseIgnorable(c))
9464 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009465 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009466 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9467 if (final_sigma) {
9468 for (j = i + 1; j < length; j++) {
9469 c = PyUnicode_READ(kind, data, j);
9470 if (!_PyUnicode_IsCaseIgnorable(c))
9471 break;
9472 }
9473 final_sigma = j == length || !_PyUnicode_IsCased(c);
9474 }
9475 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476}
9477
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009478static int
9479lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9480 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009482 /* Obscure special case. */
9483 if (c == 0x3A3) {
9484 mapped[0] = handle_capital_sigma(kind, data, length, i);
9485 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009486 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009487 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488}
9489
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009490static Py_ssize_t
9491do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009493 Py_ssize_t i, k = 0;
9494 int n_res, j;
9495 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009496
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009497 c = PyUnicode_READ(kind, data, 0);
9498 n_res = _PyUnicode_ToUpperFull(c, mapped);
9499 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009500 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009501 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009503 for (i = 1; i < length; i++) {
9504 c = PyUnicode_READ(kind, data, i);
9505 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9506 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009507 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009508 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009509 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009510 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009511 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512}
9513
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009514static Py_ssize_t
9515do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9516 Py_ssize_t i, k = 0;
9517
9518 for (i = 0; i < length; i++) {
9519 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9520 int n_res, j;
9521 if (Py_UNICODE_ISUPPER(c)) {
9522 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9523 }
9524 else if (Py_UNICODE_ISLOWER(c)) {
9525 n_res = _PyUnicode_ToUpperFull(c, mapped);
9526 }
9527 else {
9528 n_res = 1;
9529 mapped[0] = c;
9530 }
9531 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009532 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009533 res[k++] = mapped[j];
9534 }
9535 }
9536 return k;
9537}
9538
9539static Py_ssize_t
9540do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9541 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009542{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009543 Py_ssize_t i, k = 0;
9544
9545 for (i = 0; i < length; i++) {
9546 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9547 int n_res, j;
9548 if (lower)
9549 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9550 else
9551 n_res = _PyUnicode_ToUpperFull(c, mapped);
9552 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009553 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009554 res[k++] = mapped[j];
9555 }
9556 }
9557 return k;
9558}
9559
9560static Py_ssize_t
9561do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9562{
9563 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9564}
9565
9566static Py_ssize_t
9567do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9568{
9569 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9570}
9571
Benjamin Petersone51757f2012-01-12 21:10:29 -05009572static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009573do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9574{
9575 Py_ssize_t i, k = 0;
9576
9577 for (i = 0; i < length; i++) {
9578 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9579 Py_UCS4 mapped[3];
9580 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9581 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009582 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009583 res[k++] = mapped[j];
9584 }
9585 }
9586 return k;
9587}
9588
9589static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009590do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9591{
9592 Py_ssize_t i, k = 0;
9593 int previous_is_cased;
9594
9595 previous_is_cased = 0;
9596 for (i = 0; i < length; i++) {
9597 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9598 Py_UCS4 mapped[3];
9599 int n_res, j;
9600
9601 if (previous_is_cased)
9602 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9603 else
9604 n_res = _PyUnicode_ToTitleFull(c, mapped);
9605
9606 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009607 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009608 res[k++] = mapped[j];
9609 }
9610
9611 previous_is_cased = _PyUnicode_IsCased(c);
9612 }
9613 return k;
9614}
9615
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009616static PyObject *
9617case_operation(PyObject *self,
9618 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9619{
9620 PyObject *res = NULL;
9621 Py_ssize_t length, newlength = 0;
9622 int kind, outkind;
9623 void *data, *outdata;
9624 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9625
Benjamin Petersoneea48462012-01-16 14:28:50 -05009626 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009627
9628 kind = PyUnicode_KIND(self);
9629 data = PyUnicode_DATA(self);
9630 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009631 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009632 PyErr_SetString(PyExc_OverflowError, "string is too long");
9633 return NULL;
9634 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009635 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009636 if (tmp == NULL)
9637 return PyErr_NoMemory();
9638 newlength = perform(kind, data, length, tmp, &maxchar);
9639 res = PyUnicode_New(newlength, maxchar);
9640 if (res == NULL)
9641 goto leave;
9642 tmpend = tmp + newlength;
9643 outdata = PyUnicode_DATA(res);
9644 outkind = PyUnicode_KIND(res);
9645 switch (outkind) {
9646 case PyUnicode_1BYTE_KIND:
9647 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9648 break;
9649 case PyUnicode_2BYTE_KIND:
9650 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9651 break;
9652 case PyUnicode_4BYTE_KIND:
9653 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9654 break;
9655 default:
9656 assert(0);
9657 break;
9658 }
9659 leave:
9660 PyMem_FREE(tmp);
9661 return res;
9662}
9663
Tim Peters8ce9f162004-08-27 01:49:32 +00009664PyObject *
9665PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009666{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009667 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009668 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009669 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009670 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009671 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9672 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009673 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009675 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009676 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009677 int use_memcpy;
9678 unsigned char *res_data = NULL, *sep_data = NULL;
9679 PyObject *last_obj;
9680 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009682 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009683 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009684 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009685 }
9686
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009687 /* NOTE: the following code can't call back into Python code,
9688 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009689 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009690
Tim Peters05eba1f2004-08-27 21:32:02 +00009691 seqlen = PySequence_Fast_GET_SIZE(fseq);
9692 /* If empty sequence, return u"". */
9693 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009694 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009695 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009696 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009697
Tim Peters05eba1f2004-08-27 21:32:02 +00009698 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009699 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009700 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009701 if (seqlen == 1) {
9702 if (PyUnicode_CheckExact(items[0])) {
9703 res = items[0];
9704 Py_INCREF(res);
9705 Py_DECREF(fseq);
9706 return res;
9707 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009708 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009709 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009710 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009711 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009712 /* Set up sep and seplen */
9713 if (separator == NULL) {
9714 /* fall back to a blank space separator */
9715 sep = PyUnicode_FromOrdinal(' ');
9716 if (!sep)
9717 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009718 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009719 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009720 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009721 else {
9722 if (!PyUnicode_Check(separator)) {
9723 PyErr_Format(PyExc_TypeError,
9724 "separator: expected str instance,"
9725 " %.80s found",
9726 Py_TYPE(separator)->tp_name);
9727 goto onError;
9728 }
9729 if (PyUnicode_READY(separator))
9730 goto onError;
9731 sep = separator;
9732 seplen = PyUnicode_GET_LENGTH(separator);
9733 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9734 /* inc refcount to keep this code path symmetric with the
9735 above case of a blank separator */
9736 Py_INCREF(sep);
9737 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009738 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009739 }
9740
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009741 /* There are at least two things to join, or else we have a subclass
9742 * of str in the sequence.
9743 * Do a pre-pass to figure out the total amount of space we'll
9744 * need (sz), and see whether all argument are strings.
9745 */
9746 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009747#ifdef Py_DEBUG
9748 use_memcpy = 0;
9749#else
9750 use_memcpy = 1;
9751#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009752 for (i = 0; i < seqlen; i++) {
9753 const Py_ssize_t old_sz = sz;
9754 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009755 if (!PyUnicode_Check(item)) {
9756 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009757 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009758 " %.80s found",
9759 i, Py_TYPE(item)->tp_name);
9760 goto onError;
9761 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009762 if (PyUnicode_READY(item) == -1)
9763 goto onError;
9764 sz += PyUnicode_GET_LENGTH(item);
9765 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009766 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009767 if (i != 0)
9768 sz += seplen;
9769 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9770 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009771 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009772 goto onError;
9773 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009774 if (use_memcpy && last_obj != NULL) {
9775 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9776 use_memcpy = 0;
9777 }
9778 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009779 }
Tim Petersced69f82003-09-16 20:30:58 +00009780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009782 if (res == NULL)
9783 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009784
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009785 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009786#ifdef Py_DEBUG
9787 use_memcpy = 0;
9788#else
9789 if (use_memcpy) {
9790 res_data = PyUnicode_1BYTE_DATA(res);
9791 kind = PyUnicode_KIND(res);
9792 if (seplen != 0)
9793 sep_data = PyUnicode_1BYTE_DATA(sep);
9794 }
9795#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009796 if (use_memcpy) {
9797 for (i = 0; i < seqlen; ++i) {
9798 Py_ssize_t itemlen;
9799 item = items[i];
9800
9801 /* Copy item, and maybe the separator. */
9802 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009803 Py_MEMCPY(res_data,
9804 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009805 kind * seplen);
9806 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009807 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009808
9809 itemlen = PyUnicode_GET_LENGTH(item);
9810 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009811 Py_MEMCPY(res_data,
9812 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009813 kind * itemlen);
9814 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009815 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009816 }
9817 assert(res_data == PyUnicode_1BYTE_DATA(res)
9818 + kind * PyUnicode_GET_LENGTH(res));
9819 }
9820 else {
9821 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9822 Py_ssize_t itemlen;
9823 item = items[i];
9824
9825 /* Copy item, and maybe the separator. */
9826 if (i && seplen != 0) {
9827 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9828 res_offset += seplen;
9829 }
9830
9831 itemlen = PyUnicode_GET_LENGTH(item);
9832 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009833 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009834 res_offset += itemlen;
9835 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009836 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009837 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009838 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009839
Tim Peters05eba1f2004-08-27 21:32:02 +00009840 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009841 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009842 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009843 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009844
Benjamin Peterson29060642009-01-31 22:14:21 +00009845 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009846 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009848 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009849 return NULL;
9850}
9851
/* Store `value` into `length` consecutive elements of a character buffer.

   kind    PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND; PyUnicode_WCHAR_KIND is rejected by assert.
   data    start of the buffer; its elements have the width selected by kind.
   value   code point to store; it is narrowed to the element width.
   start   index (in elements, not bytes) of the first element written.
   length  number of elements written.

   Every argument is parenthesized, and `value` and `length` are evaluated
   exactly once, so arbitrary expressions (including ones with side effects
   or low-precedence operators) can be passed safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_UCS4 fill_value_ = (Py_UCS4)(value); \
        const Py_ssize_t fill_length_ = (length); \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)fill_value_, fill_length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < fill_length_; ++i_, ++to_) \
                *to_ = (Py_UCS2)fill_value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < fill_length_; ++i_, ++to_) \
                *to_ = fill_value_; \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9875
Victor Stinnerd3f08822012-05-29 12:57:52 +02009876void
9877_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9878 Py_UCS4 fill_char)
9879{
9880 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9881 const void *data = PyUnicode_DATA(unicode);
9882 assert(PyUnicode_IS_READY(unicode));
9883 assert(unicode_modifiable(unicode));
9884 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9885 assert(start >= 0);
9886 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9887 FILL(kind, data, fill_char, start, length);
9888}
9889
Victor Stinner3fe55312012-01-04 00:33:50 +01009890Py_ssize_t
9891PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9892 Py_UCS4 fill_char)
9893{
9894 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009895
9896 if (!PyUnicode_Check(unicode)) {
9897 PyErr_BadInternalCall();
9898 return -1;
9899 }
9900 if (PyUnicode_READY(unicode) == -1)
9901 return -1;
9902 if (unicode_check_modifiable(unicode))
9903 return -1;
9904
Victor Stinnerd3f08822012-05-29 12:57:52 +02009905 if (start < 0) {
9906 PyErr_SetString(PyExc_IndexError, "string index out of range");
9907 return -1;
9908 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009909 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9910 PyErr_SetString(PyExc_ValueError,
9911 "fill character is bigger than "
9912 "the string maximum character");
9913 return -1;
9914 }
9915
9916 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9917 length = Py_MIN(maxlen, length);
9918 if (length <= 0)
9919 return 0;
9920
Victor Stinnerd3f08822012-05-29 12:57:52 +02009921 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009922 return length;
9923}
9924
Victor Stinner9310abb2011-10-05 00:59:23 +02009925static PyObject *
9926pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009927 Py_ssize_t left,
9928 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009929 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009930{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931 PyObject *u;
9932 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009933 int kind;
9934 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009935
9936 if (left < 0)
9937 left = 0;
9938 if (right < 0)
9939 right = 0;
9940
Victor Stinnerc4b49542011-12-11 22:44:26 +01009941 if (left == 0 && right == 0)
9942 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009943
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9945 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009946 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9947 return NULL;
9948 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009950 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009952 if (!u)
9953 return NULL;
9954
9955 kind = PyUnicode_KIND(u);
9956 data = PyUnicode_DATA(u);
9957 if (left)
9958 FILL(kind, data, fill, 0, left);
9959 if (right)
9960 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009961 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009962 assert(_PyUnicode_CheckConsistency(u, 1));
9963 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009964}
9965
Alexander Belopolsky40018472011-02-26 01:02:56 +00009966PyObject *
9967PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009968{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009969 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009970
9971 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009972 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009973 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009974 if (PyUnicode_READY(string) == -1) {
9975 Py_DECREF(string);
9976 return NULL;
9977 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009978
Benjamin Petersonead6b532011-12-20 17:23:42 -06009979 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009981 if (PyUnicode_IS_ASCII(string))
9982 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009983 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009984 PyUnicode_GET_LENGTH(string), keepends);
9985 else
9986 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009987 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009988 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009989 break;
9990 case PyUnicode_2BYTE_KIND:
9991 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009992 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 PyUnicode_GET_LENGTH(string), keepends);
9994 break;
9995 case PyUnicode_4BYTE_KIND:
9996 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009997 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 PyUnicode_GET_LENGTH(string), keepends);
9999 break;
10000 default:
10001 assert(0);
10002 list = 0;
10003 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010004 Py_DECREF(string);
10005 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010006}
10007
Alexander Belopolsky40018472011-02-26 01:02:56 +000010008static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010009split(PyObject *self,
10010 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010011 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010013 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010014 void *buf1, *buf2;
10015 Py_ssize_t len1, len2;
10016 PyObject* out;
10017
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010019 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 if (PyUnicode_READY(self) == -1)
10022 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010025 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010027 if (PyUnicode_IS_ASCII(self))
10028 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010029 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010030 PyUnicode_GET_LENGTH(self), maxcount
10031 );
10032 else
10033 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010034 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010035 PyUnicode_GET_LENGTH(self), maxcount
10036 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 case PyUnicode_2BYTE_KIND:
10038 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010039 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 PyUnicode_GET_LENGTH(self), maxcount
10041 );
10042 case PyUnicode_4BYTE_KIND:
10043 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010044 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 PyUnicode_GET_LENGTH(self), maxcount
10046 );
10047 default:
10048 assert(0);
10049 return NULL;
10050 }
10051
10052 if (PyUnicode_READY(substring) == -1)
10053 return NULL;
10054
10055 kind1 = PyUnicode_KIND(self);
10056 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 len1 = PyUnicode_GET_LENGTH(self);
10058 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010059 if (kind1 < kind2 || len1 < len2) {
10060 out = PyList_New(1);
10061 if (out == NULL)
10062 return NULL;
10063 Py_INCREF(self);
10064 PyList_SET_ITEM(out, 0, self);
10065 return out;
10066 }
10067 buf1 = PyUnicode_DATA(self);
10068 buf2 = PyUnicode_DATA(substring);
10069 if (kind2 != kind1) {
10070 buf2 = _PyUnicode_AsKind(substring, kind1);
10071 if (!buf2)
10072 return NULL;
10073 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010074
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010075 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010076 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010077 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10078 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010079 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010080 else
10081 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010082 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083 break;
10084 case PyUnicode_2BYTE_KIND:
10085 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010086 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 break;
10088 case PyUnicode_4BYTE_KIND:
10089 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010090 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 break;
10092 default:
10093 out = NULL;
10094 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010095 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 PyMem_Free(buf2);
10097 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010098}
10099
Alexander Belopolsky40018472011-02-26 01:02:56 +000010100static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010101rsplit(PyObject *self,
10102 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010103 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010104{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010105 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 void *buf1, *buf2;
10107 Py_ssize_t len1, len2;
10108 PyObject* out;
10109
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010110 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010111 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 if (PyUnicode_READY(self) == -1)
10114 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010117 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010119 if (PyUnicode_IS_ASCII(self))
10120 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010121 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010122 PyUnicode_GET_LENGTH(self), maxcount
10123 );
10124 else
10125 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010126 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010127 PyUnicode_GET_LENGTH(self), maxcount
10128 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 case PyUnicode_2BYTE_KIND:
10130 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010131 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 PyUnicode_GET_LENGTH(self), maxcount
10133 );
10134 case PyUnicode_4BYTE_KIND:
10135 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010136 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 PyUnicode_GET_LENGTH(self), maxcount
10138 );
10139 default:
10140 assert(0);
10141 return NULL;
10142 }
10143
10144 if (PyUnicode_READY(substring) == -1)
10145 return NULL;
10146
10147 kind1 = PyUnicode_KIND(self);
10148 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 len1 = PyUnicode_GET_LENGTH(self);
10150 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010151 if (kind1 < kind2 || len1 < len2) {
10152 out = PyList_New(1);
10153 if (out == NULL)
10154 return NULL;
10155 Py_INCREF(self);
10156 PyList_SET_ITEM(out, 0, self);
10157 return out;
10158 }
10159 buf1 = PyUnicode_DATA(self);
10160 buf2 = PyUnicode_DATA(substring);
10161 if (kind2 != kind1) {
10162 buf2 = _PyUnicode_AsKind(substring, kind1);
10163 if (!buf2)
10164 return NULL;
10165 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010167 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010169 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10170 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010171 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010172 else
10173 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010174 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 break;
10176 case PyUnicode_2BYTE_KIND:
10177 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010178 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 break;
10180 case PyUnicode_4BYTE_KIND:
10181 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010182 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 break;
10184 default:
10185 out = NULL;
10186 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010187 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 PyMem_Free(buf2);
10189 return out;
10190}
10191
10192static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010193anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10194 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010196 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010198 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10199 return asciilib_find(buf1, len1, buf2, len2, offset);
10200 else
10201 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 case PyUnicode_2BYTE_KIND:
10203 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10204 case PyUnicode_4BYTE_KIND:
10205 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10206 }
10207 assert(0);
10208 return -1;
10209}
10210
10211static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010212anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10213 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010215 switch (kind) {
10216 case PyUnicode_1BYTE_KIND:
10217 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10218 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10219 else
10220 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10221 case PyUnicode_2BYTE_KIND:
10222 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10223 case PyUnicode_4BYTE_KIND:
10224 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10225 }
10226 assert(0);
10227 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010228}
10229
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010230static void
10231replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10232 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10233{
10234 int kind = PyUnicode_KIND(u);
10235 void *data = PyUnicode_DATA(u);
10236 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10237 if (kind == PyUnicode_1BYTE_KIND) {
10238 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10239 (Py_UCS1 *)data + len,
10240 u1, u2, maxcount);
10241 }
10242 else if (kind == PyUnicode_2BYTE_KIND) {
10243 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10244 (Py_UCS2 *)data + len,
10245 u1, u2, maxcount);
10246 }
10247 else {
10248 assert(kind == PyUnicode_4BYTE_KIND);
10249 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10250 (Py_UCS4 *)data + len,
10251 u1, u2, maxcount);
10252 }
10253}
10254
Alexander Belopolsky40018472011-02-26 01:02:56 +000010255static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256replace(PyObject *self, PyObject *str1,
10257 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 PyObject *u;
10260 char *sbuf = PyUnicode_DATA(self);
10261 char *buf1 = PyUnicode_DATA(str1);
10262 char *buf2 = PyUnicode_DATA(str2);
10263 int srelease = 0, release1 = 0, release2 = 0;
10264 int skind = PyUnicode_KIND(self);
10265 int kind1 = PyUnicode_KIND(str1);
10266 int kind2 = PyUnicode_KIND(str2);
10267 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10268 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10269 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010270 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010271 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272
10273 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010274 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010276 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010277
Victor Stinner59de0ee2011-10-07 10:01:28 +020010278 if (str1 == str2)
10279 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280
Victor Stinner49a0a212011-10-12 23:46:10 +020010281 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010282 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10283 if (maxchar < maxchar_str1)
10284 /* substring too wide to be present */
10285 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010286 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10287 /* Replacing str1 with str2 may cause a maxchar reduction in the
10288 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010289 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010290 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010293 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010295 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010297 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010298 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010299 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010300
Victor Stinner69ed0f42013-04-09 21:48:24 +020010301 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010302 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010303 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010304 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010305 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010307 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010309
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010310 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10311 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010312 }
10313 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 int rkind = skind;
10315 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010316 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 if (kind1 < rkind) {
10319 /* widen substring */
10320 buf1 = _PyUnicode_AsKind(str1, rkind);
10321 if (!buf1) goto error;
10322 release1 = 1;
10323 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010324 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010325 if (i < 0)
10326 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327 if (rkind > kind2) {
10328 /* widen replacement */
10329 buf2 = _PyUnicode_AsKind(str2, rkind);
10330 if (!buf2) goto error;
10331 release2 = 1;
10332 }
10333 else if (rkind < kind2) {
10334 /* widen self and buf1 */
10335 rkind = kind2;
10336 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010337 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 sbuf = _PyUnicode_AsKind(self, rkind);
10339 if (!sbuf) goto error;
10340 srelease = 1;
10341 buf1 = _PyUnicode_AsKind(str1, rkind);
10342 if (!buf1) goto error;
10343 release1 = 1;
10344 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010345 u = PyUnicode_New(slen, maxchar);
10346 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010348 assert(PyUnicode_KIND(u) == rkind);
10349 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010350
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010351 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010352 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010353 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010355 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010357
10358 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010359 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010360 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010361 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010362 if (i == -1)
10363 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010364 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010366 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010368 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010370 }
10371 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010373 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 int rkind = skind;
10375 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010378 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 buf1 = _PyUnicode_AsKind(str1, rkind);
10380 if (!buf1) goto error;
10381 release1 = 1;
10382 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010383 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010384 if (n == 0)
10385 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010387 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388 buf2 = _PyUnicode_AsKind(str2, rkind);
10389 if (!buf2) goto error;
10390 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010393 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 rkind = kind2;
10395 sbuf = _PyUnicode_AsKind(self, rkind);
10396 if (!sbuf) goto error;
10397 srelease = 1;
10398 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010399 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 buf1 = _PyUnicode_AsKind(str1, rkind);
10401 if (!buf1) goto error;
10402 release1 = 1;
10403 }
10404 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10405 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010406 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 PyErr_SetString(PyExc_OverflowError,
10408 "replace string is too long");
10409 goto error;
10410 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010411 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010412 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010413 _Py_INCREF_UNICODE_EMPTY();
10414 if (!unicode_empty)
10415 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010416 u = unicode_empty;
10417 goto done;
10418 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010419 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 PyErr_SetString(PyExc_OverflowError,
10421 "replace string is too long");
10422 goto error;
10423 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010424 u = PyUnicode_New(new_size, maxchar);
10425 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010427 assert(PyUnicode_KIND(u) == rkind);
10428 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 ires = i = 0;
10430 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010431 while (n-- > 0) {
10432 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010433 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010434 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010435 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010436 if (j == -1)
10437 break;
10438 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010439 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010440 memcpy(res + rkind * ires,
10441 sbuf + rkind * i,
10442 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010444 }
10445 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010447 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010449 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010451 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010455 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010456 memcpy(res + rkind * ires,
10457 sbuf + rkind * i,
10458 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010459 }
10460 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010461 /* interleave */
10462 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010463 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010465 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010467 if (--n <= 0)
10468 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010469 memcpy(res + rkind * ires,
10470 sbuf + rkind * i,
10471 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 ires++;
10473 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010474 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010475 memcpy(res + rkind * ires,
10476 sbuf + rkind * i,
10477 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010478 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010479 }
10480
10481 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010482 unicode_adjust_maxchar(&u);
10483 if (u == NULL)
10484 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010486
10487 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488 if (srelease)
10489 PyMem_FREE(sbuf);
10490 if (release1)
10491 PyMem_FREE(buf1);
10492 if (release2)
10493 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010494 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010496
Benjamin Peterson29060642009-01-31 22:14:21 +000010497 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010498 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 if (srelease)
10500 PyMem_FREE(sbuf);
10501 if (release1)
10502 PyMem_FREE(buf1);
10503 if (release2)
10504 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010505 return unicode_result_unchanged(self);
10506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 error:
10508 if (srelease && sbuf)
10509 PyMem_FREE(sbuf);
10510 if (release1 && buf1)
10511 PyMem_FREE(buf1);
10512 if (release2 && buf2)
10513 PyMem_FREE(buf2);
10514 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010515}
10516
10517/* --- Unicode Object Methods --------------------------------------------- */
10518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010519PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010520 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521\n\
10522Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010523characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010524
10525static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010526unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010527{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010528 if (PyUnicode_READY(self) == -1)
10529 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010530 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531}
10532
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010533PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010534 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010535\n\
10536Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010537have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010538
10539static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010540unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010542 if (PyUnicode_READY(self) == -1)
10543 return NULL;
10544 if (PyUnicode_GET_LENGTH(self) == 0)
10545 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010546 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547}
10548
Benjamin Petersond5890c82012-01-14 13:23:30 -050010549PyDoc_STRVAR(casefold__doc__,
10550 "S.casefold() -> str\n\
10551\n\
10552Return a version of S suitable for caseless comparisons.");
10553
10554static PyObject *
10555unicode_casefold(PyObject *self)
10556{
10557 if (PyUnicode_READY(self) == -1)
10558 return NULL;
10559 if (PyUnicode_IS_ASCII(self))
10560 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010561 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010562}
10563
10564
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010565/* Argument converter. Coerces to a single unicode character */
10566
10567static int
10568convert_uc(PyObject *obj, void *addr)
10569{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010571 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010572
Benjamin Peterson14339b62009-01-31 16:36:08 +000010573 uniobj = PyUnicode_FromObject(obj);
10574 if (uniobj == NULL) {
10575 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010576 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010577 return 0;
10578 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010580 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010581 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010582 Py_DECREF(uniobj);
10583 return 0;
10584 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010586 Py_DECREF(uniobj);
10587 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010588}
10589
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010590PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010591 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010592\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010593Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010594done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595
10596static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010597unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010599 Py_ssize_t marg, left;
10600 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 Py_UCS4 fillchar = ' ';
10602
Victor Stinnere9a29352011-10-01 02:14:59 +020010603 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605
Benjamin Petersonbac79492012-01-14 13:34:47 -050010606 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607 return NULL;
10608
Victor Stinnerc4b49542011-12-11 22:44:26 +010010609 if (PyUnicode_GET_LENGTH(self) >= width)
10610 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611
Victor Stinnerc4b49542011-12-11 22:44:26 +010010612 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613 left = marg / 2 + (marg & width & 1);
10614
Victor Stinner9310abb2011-10-05 00:59:23 +020010615 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616}
10617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618/* This function assumes that str1 and str2 are readied by the caller. */
10619
Marc-André Lemburge5034372000-08-08 08:04:29 +000010620static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010621unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010622{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010623#define COMPARE(TYPE1, TYPE2) \
10624 do { \
10625 TYPE1* p1 = (TYPE1 *)data1; \
10626 TYPE2* p2 = (TYPE2 *)data2; \
10627 TYPE1* end = p1 + len; \
10628 Py_UCS4 c1, c2; \
10629 for (; p1 != end; p1++, p2++) { \
10630 c1 = *p1; \
10631 c2 = *p2; \
10632 if (c1 != c2) \
10633 return (c1 < c2) ? -1 : 1; \
10634 } \
10635 } \
10636 while (0)
10637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 int kind1, kind2;
10639 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010640 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 kind1 = PyUnicode_KIND(str1);
10643 kind2 = PyUnicode_KIND(str2);
10644 data1 = PyUnicode_DATA(str1);
10645 data2 = PyUnicode_DATA(str2);
10646 len1 = PyUnicode_GET_LENGTH(str1);
10647 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010648 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010649
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010650 switch(kind1) {
10651 case PyUnicode_1BYTE_KIND:
10652 {
10653 switch(kind2) {
10654 case PyUnicode_1BYTE_KIND:
10655 {
10656 int cmp = memcmp(data1, data2, len);
10657 /* normalize result of memcmp() into the range [-1; 1] */
10658 if (cmp < 0)
10659 return -1;
10660 if (cmp > 0)
10661 return 1;
10662 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010663 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010664 case PyUnicode_2BYTE_KIND:
10665 COMPARE(Py_UCS1, Py_UCS2);
10666 break;
10667 case PyUnicode_4BYTE_KIND:
10668 COMPARE(Py_UCS1, Py_UCS4);
10669 break;
10670 default:
10671 assert(0);
10672 }
10673 break;
10674 }
10675 case PyUnicode_2BYTE_KIND:
10676 {
10677 switch(kind2) {
10678 case PyUnicode_1BYTE_KIND:
10679 COMPARE(Py_UCS2, Py_UCS1);
10680 break;
10681 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010682 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010683 COMPARE(Py_UCS2, Py_UCS2);
10684 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010685 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010686 case PyUnicode_4BYTE_KIND:
10687 COMPARE(Py_UCS2, Py_UCS4);
10688 break;
10689 default:
10690 assert(0);
10691 }
10692 break;
10693 }
10694 case PyUnicode_4BYTE_KIND:
10695 {
10696 switch(kind2) {
10697 case PyUnicode_1BYTE_KIND:
10698 COMPARE(Py_UCS4, Py_UCS1);
10699 break;
10700 case PyUnicode_2BYTE_KIND:
10701 COMPARE(Py_UCS4, Py_UCS2);
10702 break;
10703 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010704 {
10705#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10706 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10707 /* normalize result of wmemcmp() into the range [-1; 1] */
10708 if (cmp < 0)
10709 return -1;
10710 if (cmp > 0)
10711 return 1;
10712#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010713 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010714#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010715 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010716 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010717 default:
10718 assert(0);
10719 }
10720 break;
10721 }
10722 default:
10723 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010724 }
10725
Victor Stinner770e19e2012-10-04 22:59:45 +020010726 if (len1 == len2)
10727 return 0;
10728 if (len1 < len2)
10729 return -1;
10730 else
10731 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010732
10733#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010734}
10735
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010736Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010737unicode_compare_eq(PyObject *str1, PyObject *str2)
10738{
10739 int kind;
10740 void *data1, *data2;
10741 Py_ssize_t len;
10742 int cmp;
10743
Victor Stinnere5567ad2012-10-23 02:48:49 +020010744 len = PyUnicode_GET_LENGTH(str1);
10745 if (PyUnicode_GET_LENGTH(str2) != len)
10746 return 0;
10747 kind = PyUnicode_KIND(str1);
10748 if (PyUnicode_KIND(str2) != kind)
10749 return 0;
10750 data1 = PyUnicode_DATA(str1);
10751 data2 = PyUnicode_DATA(str2);
10752
10753 cmp = memcmp(data1, data2, len * kind);
10754 return (cmp == 0);
10755}
10756
10757
Alexander Belopolsky40018472011-02-26 01:02:56 +000010758int
10759PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010761 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10762 if (PyUnicode_READY(left) == -1 ||
10763 PyUnicode_READY(right) == -1)
10764 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010765
10766 /* a string is equal to itself */
10767 if (left == right)
10768 return 0;
10769
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010770 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010772 PyErr_Format(PyExc_TypeError,
10773 "Can't compare %.100s and %.100s",
10774 left->ob_type->tp_name,
10775 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776 return -1;
10777}
10778
Martin v. Löwis5b222132007-06-10 09:51:05 +000010779int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010780_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10781{
10782 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10783 if (right_str == NULL)
10784 return -1;
10785 return PyUnicode_Compare(left, right_str);
10786}
10787
10788int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010789PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10790{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 Py_ssize_t i;
10792 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 Py_UCS4 chr;
10794
Victor Stinner910337b2011-10-03 03:20:16 +020010795 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 if (PyUnicode_READY(uni) == -1)
10797 return -1;
10798 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010799 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010800 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010801 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010802 size_t len, len2 = strlen(str);
10803 int cmp;
10804
10805 len = Py_MIN(len1, len2);
10806 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010807 if (cmp != 0) {
10808 if (cmp < 0)
10809 return -1;
10810 else
10811 return 1;
10812 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010813 if (len1 > len2)
10814 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010815 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010816 return -1; /* str is longer */
10817 return 0;
10818 }
10819 else {
10820 void *data = PyUnicode_DATA(uni);
10821 /* Compare Unicode string and source character set string */
10822 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010823 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010824 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10825 /* This check keeps Python strings that end in '\0' from comparing equal
10826 to C strings identical up to that point. */
10827 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10828 return 1; /* uni is longer */
10829 if (str[i])
10830 return -1; /* str is longer */
10831 return 0;
10832 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010833}
10834
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010835
Benjamin Peterson29060642009-01-31 22:14:21 +000010836#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010837 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010838
Alexander Belopolsky40018472011-02-26 01:02:56 +000010839PyObject *
10840PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010841{
10842 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010843 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010844
Victor Stinnere5567ad2012-10-23 02:48:49 +020010845 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10846 Py_RETURN_NOTIMPLEMENTED;
10847
10848 if (PyUnicode_READY(left) == -1 ||
10849 PyUnicode_READY(right) == -1)
10850 return NULL;
10851
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010852 if (left == right) {
10853 switch (op) {
10854 case Py_EQ:
10855 case Py_LE:
10856 case Py_GE:
10857 /* a string is equal to itself */
10858 v = Py_True;
10859 break;
10860 case Py_NE:
10861 case Py_LT:
10862 case Py_GT:
10863 v = Py_False;
10864 break;
10865 default:
10866 PyErr_BadArgument();
10867 return NULL;
10868 }
10869 }
10870 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010871 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010872 result ^= (op == Py_NE);
10873 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010874 }
10875 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010876 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010877
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010878 /* Convert the return value to a Boolean */
10879 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010880 case Py_LE:
10881 v = TEST_COND(result <= 0);
10882 break;
10883 case Py_GE:
10884 v = TEST_COND(result >= 0);
10885 break;
10886 case Py_LT:
10887 v = TEST_COND(result == -1);
10888 break;
10889 case Py_GT:
10890 v = TEST_COND(result == 1);
10891 break;
10892 default:
10893 PyErr_BadArgument();
10894 return NULL;
10895 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010896 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010897 Py_INCREF(v);
10898 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010899}
10900
Alexander Belopolsky40018472011-02-26 01:02:56 +000010901int
10902PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010903{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010904 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010905 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906 void *buf1, *buf2;
10907 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010908 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010909
10910 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010911 sub = PyUnicode_FromObject(element);
10912 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010913 PyErr_Format(PyExc_TypeError,
10914 "'in <string>' requires string as left operand, not %s",
10915 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010916 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010917 }
10918
Thomas Wouters477c8d52006-05-27 19:21:47 +000010919 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010920 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010921 Py_DECREF(sub);
10922 return -1;
10923 }
10924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 kind1 = PyUnicode_KIND(str);
10926 kind2 = PyUnicode_KIND(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010927 if (kind1 < kind2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010928 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010929 Py_DECREF(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010930 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931 }
10932 len1 = PyUnicode_GET_LENGTH(str);
10933 len2 = PyUnicode_GET_LENGTH(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010934 if (len1 < len2) {
10935 Py_DECREF(sub);
10936 Py_DECREF(str);
10937 return 0;
10938 }
10939 buf1 = PyUnicode_DATA(str);
10940 buf2 = PyUnicode_DATA(sub);
10941 if (len2 == 1) {
10942 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
10943 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
10944 Py_DECREF(sub);
10945 Py_DECREF(str);
10946 return result;
10947 }
10948 if (kind2 != kind1) {
10949 buf2 = _PyUnicode_AsKind(sub, kind1);
10950 if (!buf2) {
10951 Py_DECREF(sub);
10952 Py_DECREF(str);
10953 return -1;
10954 }
10955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956
Victor Stinner77282cb2013-04-14 19:22:47 +020010957 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 case PyUnicode_1BYTE_KIND:
10959 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10960 break;
10961 case PyUnicode_2BYTE_KIND:
10962 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10963 break;
10964 case PyUnicode_4BYTE_KIND:
10965 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10966 break;
10967 default:
10968 result = -1;
10969 assert(0);
10970 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010971
10972 Py_DECREF(str);
10973 Py_DECREF(sub);
10974
Victor Stinner77282cb2013-04-14 19:22:47 +020010975 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 PyMem_Free(buf2);
10977
Guido van Rossum403d68b2000-03-13 15:55:09 +000010978 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010979}
10980
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981/* Concat to string or Unicode object giving a new Unicode object. */
10982
Alexander Belopolsky40018472011-02-26 01:02:56 +000010983PyObject *
10984PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010986 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010987 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010988 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989
10990 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010993 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010996 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997
10998 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010999 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011000 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011003 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011004 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006 }
11007
Victor Stinner488fa492011-12-12 00:01:39 +010011008 u_len = PyUnicode_GET_LENGTH(u);
11009 v_len = PyUnicode_GET_LENGTH(v);
11010 if (u_len > PY_SSIZE_T_MAX - v_len) {
11011 PyErr_SetString(PyExc_OverflowError,
11012 "strings are too large to concat");
11013 goto onError;
11014 }
11015 new_len = u_len + v_len;
11016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011018 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011019 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011022 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011024 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011025 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11026 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027 Py_DECREF(u);
11028 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011029 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031
Benjamin Peterson29060642009-01-31 22:14:21 +000011032 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033 Py_XDECREF(u);
11034 Py_XDECREF(v);
11035 return NULL;
11036}
11037
Walter Dörwald1ab83302007-05-18 17:15:44 +000011038void
Victor Stinner23e56682011-10-03 03:54:37 +020011039PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011040{
Victor Stinner23e56682011-10-03 03:54:37 +020011041 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011042 Py_UCS4 maxchar, maxchar2;
11043 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011044
11045 if (p_left == NULL) {
11046 if (!PyErr_Occurred())
11047 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011048 return;
11049 }
Victor Stinner23e56682011-10-03 03:54:37 +020011050 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011051 if (right == NULL || left == NULL
11052 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011053 if (!PyErr_Occurred())
11054 PyErr_BadInternalCall();
11055 goto error;
11056 }
11057
Benjamin Petersonbac79492012-01-14 13:34:47 -050011058 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011059 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011060 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011061 goto error;
11062
Victor Stinner488fa492011-12-12 00:01:39 +010011063 /* Shortcuts */
11064 if (left == unicode_empty) {
11065 Py_DECREF(left);
11066 Py_INCREF(right);
11067 *p_left = right;
11068 return;
11069 }
11070 if (right == unicode_empty)
11071 return;
11072
11073 left_len = PyUnicode_GET_LENGTH(left);
11074 right_len = PyUnicode_GET_LENGTH(right);
11075 if (left_len > PY_SSIZE_T_MAX - right_len) {
11076 PyErr_SetString(PyExc_OverflowError,
11077 "strings are too large to concat");
11078 goto error;
11079 }
11080 new_len = left_len + right_len;
11081
11082 if (unicode_modifiable(left)
11083 && PyUnicode_CheckExact(right)
11084 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011085 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11086 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011087 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011088 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011089 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11090 {
11091 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011092 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011093 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011094
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011095 /* copy 'right' into the newly allocated area of 'left' */
11096 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011097 }
Victor Stinner488fa492011-12-12 00:01:39 +010011098 else {
11099 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11100 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011101 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011102
Victor Stinner488fa492011-12-12 00:01:39 +010011103 /* Concat the two Unicode strings */
11104 res = PyUnicode_New(new_len, maxchar);
11105 if (res == NULL)
11106 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011107 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11108 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011109 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011110 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011111 }
11112 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011113 return;
11114
11115error:
Victor Stinner488fa492011-12-12 00:01:39 +010011116 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011117}
11118
11119void
11120PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11121{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011122 PyUnicode_Append(pleft, right);
11123 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011124}
11125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011126PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011127 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011129Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011130string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011131interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132
11133static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011134unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011136 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011137 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011138 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011140 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 void *buf1, *buf2;
11142 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143
Jesus Ceaac451502011-04-20 17:09:23 +020011144 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11145 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011146 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 kind1 = PyUnicode_KIND(self);
11149 kind2 = PyUnicode_KIND(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011150 if (kind1 < kind2) {
Christian Heimesd47802e2013-06-29 21:33:36 +020011151 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011152 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011153 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011154 len1 = PyUnicode_GET_LENGTH(self);
11155 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011156 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011157 if (end - start < len2) {
11158 Py_DECREF(substring);
11159 return PyLong_FromLong(0);
11160 }
11161 buf1 = PyUnicode_DATA(self);
11162 buf2 = PyUnicode_DATA(substring);
11163 if (kind2 != kind1) {
11164 buf2 = _PyUnicode_AsKind(substring, kind1);
11165 if (!buf2) {
11166 Py_DECREF(substring);
11167 return NULL;
11168 }
11169 }
11170 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171 case PyUnicode_1BYTE_KIND:
11172 iresult = ucs1lib_count(
11173 ((Py_UCS1*)buf1) + start, end - start,
11174 buf2, len2, PY_SSIZE_T_MAX
11175 );
11176 break;
11177 case PyUnicode_2BYTE_KIND:
11178 iresult = ucs2lib_count(
11179 ((Py_UCS2*)buf1) + start, end - start,
11180 buf2, len2, PY_SSIZE_T_MAX
11181 );
11182 break;
11183 case PyUnicode_4BYTE_KIND:
11184 iresult = ucs4lib_count(
11185 ((Py_UCS4*)buf1) + start, end - start,
11186 buf2, len2, PY_SSIZE_T_MAX
11187 );
11188 break;
11189 default:
11190 assert(0); iresult = 0;
11191 }
11192
11193 result = PyLong_FromSsize_t(iresult);
11194
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011195 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197
11198 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011199
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200 return result;
11201}
11202
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011203PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011204 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011206Encode S using the codec registered for encoding. Default encoding\n\
11207is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011208handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011209a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11210'xmlcharrefreplace' as well as any other name registered with\n\
11211codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212
11213static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011214unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011216 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217 char *encoding = NULL;
11218 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011219
Benjamin Peterson308d6372009-09-18 21:42:35 +000011220 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11221 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011223 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011224}
11225
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011226PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011227 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228\n\
11229Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011230If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231
11232static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011233unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011235 Py_ssize_t i, j, line_pos, src_len, incr;
11236 Py_UCS4 ch;
11237 PyObject *u;
11238 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011239 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011241 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011242 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243
Ezio Melotti745d54d2013-11-16 19:10:57 +020011244 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11245 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247
Antoine Pitrou22425222011-10-04 19:10:51 +020011248 if (PyUnicode_READY(self) == -1)
11249 return NULL;
11250
Thomas Wouters7e474022000-07-16 12:04:32 +000011251 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011252 src_len = PyUnicode_GET_LENGTH(self);
11253 i = j = line_pos = 0;
11254 kind = PyUnicode_KIND(self);
11255 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011256 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011257 for (; i < src_len; i++) {
11258 ch = PyUnicode_READ(kind, src_data, i);
11259 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011260 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011261 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011262 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011263 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011264 goto overflow;
11265 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011266 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011267 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011270 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011271 goto overflow;
11272 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011274 if (ch == '\n' || ch == '\r')
11275 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011277 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011278 if (!found)
11279 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011280
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011282 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283 if (!u)
11284 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011285 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286
Antoine Pitroue71d5742011-10-04 15:55:09 +020011287 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288
Antoine Pitroue71d5742011-10-04 15:55:09 +020011289 for (; i < src_len; i++) {
11290 ch = PyUnicode_READ(kind, src_data, i);
11291 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011292 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011293 incr = tabsize - (line_pos % tabsize);
11294 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011295 FILL(kind, dest_data, ' ', j, incr);
11296 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011297 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011298 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011299 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011300 line_pos++;
11301 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011302 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011303 if (ch == '\n' || ch == '\r')
11304 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011306 }
11307 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011308 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011309
Antoine Pitroue71d5742011-10-04 15:55:09 +020011310 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011311 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11312 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313}
11314
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011315PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011316 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317\n\
11318Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011319such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320arguments start and end are interpreted as in slice notation.\n\
11321\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011322Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323
11324static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011327 /* initialize variables to prevent gcc warning */
11328 PyObject *substring = NULL;
11329 Py_ssize_t start = 0;
11330 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011331 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332
Jesus Ceaac451502011-04-20 17:09:23 +020011333 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11334 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336
Christian Heimesd47802e2013-06-29 21:33:36 +020011337 if (PyUnicode_READY(self) == -1) {
11338 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011340 }
11341 if (PyUnicode_READY(substring) == -1) {
11342 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011344 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345
Victor Stinner7931d9a2011-11-04 00:22:48 +010011346 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347
11348 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011350 if (result == -2)
11351 return NULL;
11352
Christian Heimes217cfd12007-12-02 14:31:20 +000011353 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354}
11355
11356static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011357unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011359 void *data;
11360 enum PyUnicode_Kind kind;
11361 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011362
11363 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11364 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011366 }
11367 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11368 PyErr_SetString(PyExc_IndexError, "string index out of range");
11369 return NULL;
11370 }
11371 kind = PyUnicode_KIND(self);
11372 data = PyUnicode_DATA(self);
11373 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011374 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375}
11376
Guido van Rossumc2504932007-09-18 19:42:40 +000011377/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011378 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011379static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011380unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381{
Guido van Rossumc2504932007-09-18 19:42:40 +000011382 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011383 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011384
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011385#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011386 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011387#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388 if (_PyUnicode_HASH(self) != -1)
11389 return _PyUnicode_HASH(self);
11390 if (PyUnicode_READY(self) == -1)
11391 return -1;
11392 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011393 /*
11394 We make the hash of the empty string be 0, rather than using
11395 (prefix ^ suffix), since this slightly obfuscates the hash secret
11396 */
11397 if (len == 0) {
11398 _PyUnicode_HASH(self) = 0;
11399 return 0;
11400 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011401 x = _Py_HashBytes(PyUnicode_DATA(self),
11402 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011404 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405}
11406
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011407PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011408 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011410Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411
11412static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011415 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011416 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011417 PyObject *substring = NULL;
11418 Py_ssize_t start = 0;
11419 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420
Jesus Ceaac451502011-04-20 17:09:23 +020011421 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11422 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424
Christian Heimesd47a0452013-06-29 21:21:37 +020011425 if (PyUnicode_READY(self) == -1) {
11426 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011427 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011428 }
11429 if (PyUnicode_READY(substring) == -1) {
11430 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433
Victor Stinner7931d9a2011-11-04 00:22:48 +010011434 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435
11436 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011438 if (result == -2)
11439 return NULL;
11440
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441 if (result < 0) {
11442 PyErr_SetString(PyExc_ValueError, "substring not found");
11443 return NULL;
11444 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011445
Christian Heimes217cfd12007-12-02 14:31:20 +000011446 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447}
11448
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011449PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011450 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011452Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011453at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454
11455static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011456unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 Py_ssize_t i, length;
11459 int kind;
11460 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461 int cased;
11462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 if (PyUnicode_READY(self) == -1)
11464 return NULL;
11465 length = PyUnicode_GET_LENGTH(self);
11466 kind = PyUnicode_KIND(self);
11467 data = PyUnicode_DATA(self);
11468
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011470 if (length == 1)
11471 return PyBool_FromLong(
11472 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011474 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011475 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011476 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011477
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011479 for (i = 0; i < length; i++) {
11480 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011481
Benjamin Peterson29060642009-01-31 22:14:21 +000011482 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11483 return PyBool_FromLong(0);
11484 else if (!cased && Py_UNICODE_ISLOWER(ch))
11485 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011487 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488}
11489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011490PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011491 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011493Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011494at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495
11496static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011497unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499 Py_ssize_t i, length;
11500 int kind;
11501 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502 int cased;
11503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 if (PyUnicode_READY(self) == -1)
11505 return NULL;
11506 length = PyUnicode_GET_LENGTH(self);
11507 kind = PyUnicode_KIND(self);
11508 data = PyUnicode_DATA(self);
11509
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011511 if (length == 1)
11512 return PyBool_FromLong(
11513 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011515 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011516 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011517 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011518
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 for (i = 0; i < length; i++) {
11521 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011522
Benjamin Peterson29060642009-01-31 22:14:21 +000011523 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11524 return PyBool_FromLong(0);
11525 else if (!cased && Py_UNICODE_ISUPPER(ch))
11526 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011528 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529}
11530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011531PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011532 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011533\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011534Return True if S is a titlecased string and there is at least one\n\
11535character in S, i.e. upper- and titlecase characters may only\n\
11536follow uncased characters and lowercase characters only cased ones.\n\
11537Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538
11539static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011540unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011542 Py_ssize_t i, length;
11543 int kind;
11544 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545 int cased, previous_is_cased;
11546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 if (PyUnicode_READY(self) == -1)
11548 return NULL;
11549 length = PyUnicode_GET_LENGTH(self);
11550 kind = PyUnicode_KIND(self);
11551 data = PyUnicode_DATA(self);
11552
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 if (length == 1) {
11555 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11556 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11557 (Py_UNICODE_ISUPPER(ch) != 0));
11558 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011560 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011561 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011562 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011563
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564 cased = 0;
11565 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011566 for (i = 0; i < length; i++) {
11567 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011568
Benjamin Peterson29060642009-01-31 22:14:21 +000011569 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11570 if (previous_is_cased)
11571 return PyBool_FromLong(0);
11572 previous_is_cased = 1;
11573 cased = 1;
11574 }
11575 else if (Py_UNICODE_ISLOWER(ch)) {
11576 if (!previous_is_cased)
11577 return PyBool_FromLong(0);
11578 previous_is_cased = 1;
11579 cased = 1;
11580 }
11581 else
11582 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011584 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585}
11586
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011587PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011588 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011590Return True if all characters in S are whitespace\n\
11591and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592
11593static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011594unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596 Py_ssize_t i, length;
11597 int kind;
11598 void *data;
11599
11600 if (PyUnicode_READY(self) == -1)
11601 return NULL;
11602 length = PyUnicode_GET_LENGTH(self);
11603 kind = PyUnicode_KIND(self);
11604 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607 if (length == 1)
11608 return PyBool_FromLong(
11609 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011611 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011612 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011613 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011615 for (i = 0; i < length; i++) {
11616 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011617 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011618 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011620 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621}
11622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011623PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011624 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011625\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011626Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011627and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011628
11629static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011630unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011631{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 Py_ssize_t i, length;
11633 int kind;
11634 void *data;
11635
11636 if (PyUnicode_READY(self) == -1)
11637 return NULL;
11638 length = PyUnicode_GET_LENGTH(self);
11639 kind = PyUnicode_KIND(self);
11640 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011641
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011642 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 if (length == 1)
11644 return PyBool_FromLong(
11645 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011646
11647 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011648 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011649 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651 for (i = 0; i < length; i++) {
11652 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011653 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011654 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011655 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011656}
11657
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011658PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011659 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011660\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011661Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011662and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011663
11664static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011665unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011666{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 int kind;
11668 void *data;
11669 Py_ssize_t len, i;
11670
11671 if (PyUnicode_READY(self) == -1)
11672 return NULL;
11673
11674 kind = PyUnicode_KIND(self);
11675 data = PyUnicode_DATA(self);
11676 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011677
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011678 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 if (len == 1) {
11680 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11681 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11682 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011683
11684 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011686 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688 for (i = 0; i < len; i++) {
11689 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011690 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011691 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011692 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011693 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011694}
11695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011696PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011697 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011699Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011700False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701
11702static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011703unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 Py_ssize_t i, length;
11706 int kind;
11707 void *data;
11708
11709 if (PyUnicode_READY(self) == -1)
11710 return NULL;
11711 length = PyUnicode_GET_LENGTH(self);
11712 kind = PyUnicode_KIND(self);
11713 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716 if (length == 1)
11717 return PyBool_FromLong(
11718 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011720 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011722 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011724 for (i = 0; i < length; i++) {
11725 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011726 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011728 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729}
11730
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011731PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011732 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011734Return True if all characters in S are digits\n\
11735and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736
11737static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011738unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740 Py_ssize_t i, length;
11741 int kind;
11742 void *data;
11743
11744 if (PyUnicode_READY(self) == -1)
11745 return NULL;
11746 length = PyUnicode_GET_LENGTH(self);
11747 kind = PyUnicode_KIND(self);
11748 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 if (length == 1) {
11752 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11753 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11754 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011756 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011758 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760 for (i = 0; i < length; i++) {
11761 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011762 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011764 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765}
11766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011767PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011768 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011770Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011771False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772
11773static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011774unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 Py_ssize_t i, length;
11777 int kind;
11778 void *data;
11779
11780 if (PyUnicode_READY(self) == -1)
11781 return NULL;
11782 length = PyUnicode_GET_LENGTH(self);
11783 kind = PyUnicode_KIND(self);
11784 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011787 if (length == 1)
11788 return PyBool_FromLong(
11789 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011791 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011793 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795 for (i = 0; i < length; i++) {
11796 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011797 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011799 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800}
11801
Martin v. Löwis47383402007-08-15 07:32:56 +000011802int
11803PyUnicode_IsIdentifier(PyObject *self)
11804{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 int kind;
11806 void *data;
11807 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011808 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 if (PyUnicode_READY(self) == -1) {
11811 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011812 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 }
11814
11815 /* Special case for empty strings */
11816 if (PyUnicode_GET_LENGTH(self) == 0)
11817 return 0;
11818 kind = PyUnicode_KIND(self);
11819 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011820
11821 /* PEP 3131 says that the first character must be in
11822 XID_Start and subsequent characters in XID_Continue,
11823 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011824 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011825 letters, digits, underscore). However, given the current
11826 definition of XID_Start and XID_Continue, it is sufficient
11827 to check just for these, except that _ must be allowed
11828 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011830 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011831 return 0;
11832
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011833 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011834 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011835 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011836 return 1;
11837}
11838
11839PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011840 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011841\n\
11842Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011843to the language definition.\n\
11844\n\
11845Use keyword.iskeyword() to test for reserved identifiers\n\
11846such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011847
11848static PyObject*
11849unicode_isidentifier(PyObject *self)
11850{
11851 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11852}
11853
Georg Brandl559e5d72008-06-11 18:37:52 +000011854PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011855 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011856\n\
11857Return True if all characters in S are considered\n\
11858printable in repr() or S is empty, False otherwise.");
11859
11860static PyObject*
11861unicode_isprintable(PyObject *self)
11862{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011863 Py_ssize_t i, length;
11864 int kind;
11865 void *data;
11866
11867 if (PyUnicode_READY(self) == -1)
11868 return NULL;
11869 length = PyUnicode_GET_LENGTH(self);
11870 kind = PyUnicode_KIND(self);
11871 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011872
11873 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 if (length == 1)
11875 return PyBool_FromLong(
11876 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 for (i = 0; i < length; i++) {
11879 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011880 Py_RETURN_FALSE;
11881 }
11882 }
11883 Py_RETURN_TRUE;
11884}
11885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011886PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011887 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888\n\
11889Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011890iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891
11892static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011893unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011895 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896}
11897
Martin v. Löwis18e16552006-02-15 17:27:45 +000011898static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011899unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 if (PyUnicode_READY(self) == -1)
11902 return -1;
11903 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904}
11905
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011906PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011907 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011909Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011910done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911
11912static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011913unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011915 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 Py_UCS4 fillchar = ' ';
11917
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011918 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919 return NULL;
11920
Benjamin Petersonbac79492012-01-14 13:34:47 -050011921 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011922 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923
Victor Stinnerc4b49542011-12-11 22:44:26 +010011924 if (PyUnicode_GET_LENGTH(self) >= width)
11925 return unicode_result_unchanged(self);
11926
11927 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928}
11929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011930PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011931 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011933Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934
11935static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011936unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011938 if (PyUnicode_READY(self) == -1)
11939 return NULL;
11940 if (PyUnicode_IS_ASCII(self))
11941 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011942 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943}
11944
/* Which end(s) of the string a strip operation works on. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip types above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip type i: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
11953
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011954/* externally visible for str.strip(unicode) */
11955PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011956_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011957{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 void *data;
11959 int kind;
11960 Py_ssize_t i, j, len;
11961 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011962 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11965 return NULL;
11966
11967 kind = PyUnicode_KIND(self);
11968 data = PyUnicode_DATA(self);
11969 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011970 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11972 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011973 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011974
Benjamin Peterson14339b62009-01-31 16:36:08 +000011975 i = 0;
11976 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011977 while (i < len) {
11978 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11979 if (!BLOOM(sepmask, ch))
11980 break;
11981 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11982 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011983 i++;
11984 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011985 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011986
Benjamin Peterson14339b62009-01-31 16:36:08 +000011987 j = len;
11988 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011989 j--;
11990 while (j >= i) {
11991 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11992 if (!BLOOM(sepmask, ch))
11993 break;
11994 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11995 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011997 }
11998
Benjamin Peterson29060642009-01-31 22:14:21 +000011999 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012000 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012001
Victor Stinner7931d9a2011-11-04 00:22:48 +010012002 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003}
12004
12005PyObject*
12006PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12007{
12008 unsigned char *data;
12009 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012010 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011
Victor Stinnerde636f32011-10-01 03:55:54 +020012012 if (PyUnicode_READY(self) == -1)
12013 return NULL;
12014
Victor Stinner684d5fd2012-05-03 02:32:34 +020012015 length = PyUnicode_GET_LENGTH(self);
12016 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012017
Victor Stinner684d5fd2012-05-03 02:32:34 +020012018 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012019 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020
Victor Stinnerde636f32011-10-01 03:55:54 +020012021 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012022 PyErr_SetString(PyExc_IndexError, "string index out of range");
12023 return NULL;
12024 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012025 if (start >= length || end < start)
12026 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012027
Victor Stinner684d5fd2012-05-03 02:32:34 +020012028 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012029 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012030 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012031 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012032 }
12033 else {
12034 kind = PyUnicode_KIND(self);
12035 data = PyUnicode_1BYTE_DATA(self);
12036 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012037 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012038 length);
12039 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041
12042static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012043do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012044{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012045 Py_ssize_t len, i, j;
12046
12047 if (PyUnicode_READY(self) == -1)
12048 return NULL;
12049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012050 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012051
Victor Stinnercc7af722013-04-09 22:39:24 +020012052 if (PyUnicode_IS_ASCII(self)) {
12053 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12054
12055 i = 0;
12056 if (striptype != RIGHTSTRIP) {
12057 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012058 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012059 if (!_Py_ascii_whitespace[ch])
12060 break;
12061 i++;
12062 }
12063 }
12064
12065 j = len;
12066 if (striptype != LEFTSTRIP) {
12067 j--;
12068 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012069 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012070 if (!_Py_ascii_whitespace[ch])
12071 break;
12072 j--;
12073 }
12074 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012075 }
12076 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012077 else {
12078 int kind = PyUnicode_KIND(self);
12079 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012080
Victor Stinnercc7af722013-04-09 22:39:24 +020012081 i = 0;
12082 if (striptype != RIGHTSTRIP) {
12083 while (i < len) {
12084 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12085 if (!Py_UNICODE_ISSPACE(ch))
12086 break;
12087 i++;
12088 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012089 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012090
12091 j = len;
12092 if (striptype != LEFTSTRIP) {
12093 j--;
12094 while (j >= i) {
12095 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12096 if (!Py_UNICODE_ISSPACE(ch))
12097 break;
12098 j--;
12099 }
12100 j++;
12101 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012102 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012103
Victor Stinner7931d9a2011-11-04 00:22:48 +010012104 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105}
12106
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012107
12108static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012109do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012110{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012111 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012112
Serhiy Storchakac6792272013-10-19 21:03:34 +030012113 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012114 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012115
Benjamin Peterson14339b62009-01-31 16:36:08 +000012116 if (sep != NULL && sep != Py_None) {
12117 if (PyUnicode_Check(sep))
12118 return _PyUnicode_XStrip(self, striptype, sep);
12119 else {
12120 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012121 "%s arg must be None or str",
12122 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012123 return NULL;
12124 }
12125 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012126
Benjamin Peterson14339b62009-01-31 16:36:08 +000012127 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012128}
12129
12130
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012131PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012132 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012133\n\
12134Return a copy of the string S with leading and trailing\n\
12135whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012136If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012137
12138static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012139unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012140{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012141 if (PyTuple_GET_SIZE(args) == 0)
12142 return do_strip(self, BOTHSTRIP); /* Common case */
12143 else
12144 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012145}
12146
12147
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012148PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012149 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012150\n\
12151Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012152If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012153
12154static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012155unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012156{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012157 if (PyTuple_GET_SIZE(args) == 0)
12158 return do_strip(self, LEFTSTRIP); /* Common case */
12159 else
12160 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012161}
12162
12163
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012164PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012165 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012166\n\
12167Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012168If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012169
12170static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012171unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012172{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012173 if (PyTuple_GET_SIZE(args) == 0)
12174 return do_strip(self, RIGHTSTRIP); /* Common case */
12175 else
12176 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012177}
12178
12179
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012181unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012183 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012185
Serhiy Storchaka05997252013-01-26 12:14:02 +020012186 if (len < 1)
12187 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188
Victor Stinnerc4b49542011-12-11 22:44:26 +010012189 /* no repeat, return original string */
12190 if (len == 1)
12191 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012192
Benjamin Petersonbac79492012-01-14 13:34:47 -050012193 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 return NULL;
12195
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012196 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012197 PyErr_SetString(PyExc_OverflowError,
12198 "repeated string is too long");
12199 return NULL;
12200 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012202
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012203 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204 if (!u)
12205 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012206 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 if (PyUnicode_GET_LENGTH(str) == 1) {
12209 const int kind = PyUnicode_KIND(str);
12210 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012211 if (kind == PyUnicode_1BYTE_KIND) {
12212 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012213 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012214 }
12215 else if (kind == PyUnicode_2BYTE_KIND) {
12216 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012217 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012218 ucs2[n] = fill_char;
12219 } else {
12220 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12221 assert(kind == PyUnicode_4BYTE_KIND);
12222 for (n = 0; n < len; ++n)
12223 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012224 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 }
12226 else {
12227 /* number of characters copied this far */
12228 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012229 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012230 char *to = (char *) PyUnicode_DATA(u);
12231 Py_MEMCPY(to, PyUnicode_DATA(str),
12232 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012233 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 n = (done <= nchars-done) ? done : nchars-done;
12235 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012236 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238 }
12239
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012240 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012241 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242}
12243
Alexander Belopolsky40018472011-02-26 01:02:56 +000012244PyObject *
12245PyUnicode_Replace(PyObject *obj,
12246 PyObject *subobj,
12247 PyObject *replobj,
12248 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012249{
12250 PyObject *self;
12251 PyObject *str1;
12252 PyObject *str2;
12253 PyObject *result;
12254
12255 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012256 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012259 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012260 Py_DECREF(self);
12261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262 }
12263 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012264 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012265 Py_DECREF(self);
12266 Py_DECREF(str1);
12267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012269 if (PyUnicode_READY(self) == -1 ||
12270 PyUnicode_READY(str1) == -1 ||
12271 PyUnicode_READY(str2) == -1)
12272 result = NULL;
12273 else
12274 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275 Py_DECREF(self);
12276 Py_DECREF(str1);
12277 Py_DECREF(str2);
12278 return result;
12279}
12280
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012281PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012282 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283\n\
12284Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012285old replaced by new. If the optional argument count is\n\
12286given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287
12288static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012289unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 PyObject *str1;
12292 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012293 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012294 PyObject *result;
12295
Martin v. Löwis18e16552006-02-15 17:27:45 +000012296 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012298 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012299 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012301 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 return NULL;
12303 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012304 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012305 Py_DECREF(str1);
12306 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012307 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012308 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12309 result = NULL;
12310 else
12311 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012312
12313 Py_DECREF(str1);
12314 Py_DECREF(str2);
12315 return result;
12316}
12317
Alexander Belopolsky40018472011-02-26 01:02:56 +000012318static PyObject *
12319unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012321 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 Py_ssize_t isize;
12323 Py_ssize_t osize, squote, dquote, i, o;
12324 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012325 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012328 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012329 return NULL;
12330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 isize = PyUnicode_GET_LENGTH(unicode);
12332 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 /* Compute length of output, quote characters, and
12335 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012336 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337 max = 127;
12338 squote = dquote = 0;
12339 ikind = PyUnicode_KIND(unicode);
12340 for (i = 0; i < isize; i++) {
12341 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012342 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012344 case '\'': squote++; break;
12345 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012347 incr = 2;
12348 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 default:
12350 /* Fast-path ASCII */
12351 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012352 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012353 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012354 ;
12355 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012358 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012360 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012362 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012364 if (osize > PY_SSIZE_T_MAX - incr) {
12365 PyErr_SetString(PyExc_OverflowError,
12366 "string is too long to generate repr");
12367 return NULL;
12368 }
12369 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 }
12371
12372 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012373 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012374 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012375 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 if (dquote)
12377 /* Both squote and dquote present. Use squote,
12378 and escape them */
12379 osize += squote;
12380 else
12381 quote = '"';
12382 }
Victor Stinner55c08782013-04-14 18:45:39 +020012383 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384
12385 repr = PyUnicode_New(osize, max);
12386 if (repr == NULL)
12387 return NULL;
12388 okind = PyUnicode_KIND(repr);
12389 odata = PyUnicode_DATA(repr);
12390
12391 PyUnicode_WRITE(okind, odata, 0, quote);
12392 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012393 if (unchanged) {
12394 _PyUnicode_FastCopyCharacters(repr, 1,
12395 unicode, 0,
12396 isize);
12397 }
12398 else {
12399 for (i = 0, o = 1; i < isize; i++) {
12400 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012401
Victor Stinner55c08782013-04-14 18:45:39 +020012402 /* Escape quotes and backslashes */
12403 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012404 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012405 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012406 continue;
12407 }
12408
12409 /* Map special whitespace to '\t', \n', '\r' */
12410 if (ch == '\t') {
12411 PyUnicode_WRITE(okind, odata, o++, '\\');
12412 PyUnicode_WRITE(okind, odata, o++, 't');
12413 }
12414 else if (ch == '\n') {
12415 PyUnicode_WRITE(okind, odata, o++, '\\');
12416 PyUnicode_WRITE(okind, odata, o++, 'n');
12417 }
12418 else if (ch == '\r') {
12419 PyUnicode_WRITE(okind, odata, o++, '\\');
12420 PyUnicode_WRITE(okind, odata, o++, 'r');
12421 }
12422
12423 /* Map non-printable US ASCII to '\xhh' */
12424 else if (ch < ' ' || ch == 0x7F) {
12425 PyUnicode_WRITE(okind, odata, o++, '\\');
12426 PyUnicode_WRITE(okind, odata, o++, 'x');
12427 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12428 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12429 }
12430
12431 /* Copy ASCII characters as-is */
12432 else if (ch < 0x7F) {
12433 PyUnicode_WRITE(okind, odata, o++, ch);
12434 }
12435
12436 /* Non-ASCII characters */
12437 else {
12438 /* Map Unicode whitespace and control characters
12439 (categories Z* and C* except ASCII space)
12440 */
12441 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12442 PyUnicode_WRITE(okind, odata, o++, '\\');
12443 /* Map 8-bit characters to '\xhh' */
12444 if (ch <= 0xff) {
12445 PyUnicode_WRITE(okind, odata, o++, 'x');
12446 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12447 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12448 }
12449 /* Map 16-bit characters to '\uxxxx' */
12450 else if (ch <= 0xffff) {
12451 PyUnicode_WRITE(okind, odata, o++, 'u');
12452 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12453 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12454 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12455 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12456 }
12457 /* Map 21-bit characters to '\U00xxxxxx' */
12458 else {
12459 PyUnicode_WRITE(okind, odata, o++, 'U');
12460 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12461 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12462 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12463 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12464 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12465 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12466 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12467 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12468 }
12469 }
12470 /* Copy characters as-is */
12471 else {
12472 PyUnicode_WRITE(okind, odata, o++, ch);
12473 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012474 }
12475 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012476 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012478 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012479 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012480}
12481
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012482PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012483 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012484\n\
12485Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012486such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012487arguments start and end are interpreted as in slice notation.\n\
12488\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012489Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490
12491static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012494 /* initialize variables to prevent gcc warning */
12495 PyObject *substring = NULL;
12496 Py_ssize_t start = 0;
12497 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012498 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012499
Jesus Ceaac451502011-04-20 17:09:23 +020012500 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12501 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012502 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503
Christian Heimesea71a522013-06-29 21:17:34 +020012504 if (PyUnicode_READY(self) == -1) {
12505 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012506 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012507 }
12508 if (PyUnicode_READY(substring) == -1) {
12509 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012510 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012511 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012512
Victor Stinner7931d9a2011-11-04 00:22:48 +010012513 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012514
12515 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012517 if (result == -2)
12518 return NULL;
12519
Christian Heimes217cfd12007-12-02 14:31:20 +000012520 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521}
12522
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012523PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012524 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012526Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527
12528static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012531 /* initialize variables to prevent gcc warning */
12532 PyObject *substring = NULL;
12533 Py_ssize_t start = 0;
12534 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012535 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536
Jesus Ceaac451502011-04-20 17:09:23 +020012537 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12538 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012539 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540
Christian Heimesea71a522013-06-29 21:17:34 +020012541 if (PyUnicode_READY(self) == -1) {
12542 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012544 }
12545 if (PyUnicode_READY(substring) == -1) {
12546 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012547 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012549
Victor Stinner7931d9a2011-11-04 00:22:48 +010012550 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551
12552 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554 if (result == -2)
12555 return NULL;
12556
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557 if (result < 0) {
12558 PyErr_SetString(PyExc_ValueError, "substring not found");
12559 return NULL;
12560 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012561
Christian Heimes217cfd12007-12-02 14:31:20 +000012562 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563}
12564
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012565PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012566 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012568Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012569done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570
12571static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012572unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012574 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 Py_UCS4 fillchar = ' ';
12576
Victor Stinnere9a29352011-10-01 02:14:59 +020012577 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012579
Benjamin Petersonbac79492012-01-14 13:34:47 -050012580 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581 return NULL;
12582
Victor Stinnerc4b49542011-12-11 22:44:26 +010012583 if (PyUnicode_GET_LENGTH(self) >= width)
12584 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585
Victor Stinnerc4b49542011-12-11 22:44:26 +010012586 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587}
12588
Alexander Belopolsky40018472011-02-26 01:02:56 +000012589PyObject *
12590PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591{
12592 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012593
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594 s = PyUnicode_FromObject(s);
12595 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012596 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012597 if (sep != NULL) {
12598 sep = PyUnicode_FromObject(sep);
12599 if (sep == NULL) {
12600 Py_DECREF(s);
12601 return NULL;
12602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012603 }
12604
Victor Stinner9310abb2011-10-05 00:59:23 +020012605 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012606
12607 Py_DECREF(s);
12608 Py_XDECREF(sep);
12609 return result;
12610}
12611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012612PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012613 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614\n\
12615Return a list of the words in S, using sep as the\n\
12616delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012617splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012618whitespace string is a separator and empty strings are\n\
12619removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620
12621static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012622unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012624 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012626 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012628 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12629 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630 return NULL;
12631
12632 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012633 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012635 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012637 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638}
12639
Thomas Wouters477c8d52006-05-27 19:21:47 +000012640PyObject *
12641PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12642{
12643 PyObject* str_obj;
12644 PyObject* sep_obj;
12645 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012646 int kind1, kind2;
12647 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012649
12650 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012651 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012652 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012653 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012654 if (!sep_obj) {
12655 Py_DECREF(str_obj);
12656 return NULL;
12657 }
12658 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12659 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012660 Py_DECREF(str_obj);
12661 return NULL;
12662 }
12663
Victor Stinner14f8f022011-10-05 20:58:25 +020012664 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 len1 = PyUnicode_GET_LENGTH(str_obj);
12667 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012668 if (kind1 < kind2 || len1 < len2) {
12669 _Py_INCREF_UNICODE_EMPTY();
12670 if (!unicode_empty)
12671 out = NULL;
12672 else {
12673 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12674 Py_DECREF(unicode_empty);
12675 }
12676 Py_DECREF(sep_obj);
12677 Py_DECREF(str_obj);
12678 return out;
12679 }
12680 buf1 = PyUnicode_DATA(str_obj);
12681 buf2 = PyUnicode_DATA(sep_obj);
12682 if (kind2 != kind1) {
12683 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12684 if (!buf2)
12685 goto onError;
12686 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012687
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012688 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012689 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012690 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12691 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12692 else
12693 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012694 break;
12695 case PyUnicode_2BYTE_KIND:
12696 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12697 break;
12698 case PyUnicode_4BYTE_KIND:
12699 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12700 break;
12701 default:
12702 assert(0);
12703 out = 0;
12704 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012705
12706 Py_DECREF(sep_obj);
12707 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012708 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012709 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012710
12711 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012712 onError:
12713 Py_DECREF(sep_obj);
12714 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012715 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716 PyMem_Free(buf2);
12717 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012718}
12719
12720
12721PyObject *
12722PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12723{
12724 PyObject* str_obj;
12725 PyObject* sep_obj;
12726 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012727 int kind1, kind2;
12728 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012729 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012730
12731 str_obj = PyUnicode_FromObject(str_in);
12732 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012733 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012734 sep_obj = PyUnicode_FromObject(sep_in);
12735 if (!sep_obj) {
12736 Py_DECREF(str_obj);
12737 return NULL;
12738 }
12739
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012740 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012742 len1 = PyUnicode_GET_LENGTH(str_obj);
12743 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012744 if (kind1 < kind2 || len1 < len2) {
12745 _Py_INCREF_UNICODE_EMPTY();
12746 if (!unicode_empty)
12747 out = NULL;
12748 else {
12749 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12750 Py_DECREF(unicode_empty);
12751 }
12752 Py_DECREF(sep_obj);
12753 Py_DECREF(str_obj);
12754 return out;
12755 }
12756 buf1 = PyUnicode_DATA(str_obj);
12757 buf2 = PyUnicode_DATA(sep_obj);
12758 if (kind2 != kind1) {
12759 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12760 if (!buf2)
12761 goto onError;
12762 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012763
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012764 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012765 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012766 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12767 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12768 else
12769 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012770 break;
12771 case PyUnicode_2BYTE_KIND:
12772 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12773 break;
12774 case PyUnicode_4BYTE_KIND:
12775 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12776 break;
12777 default:
12778 assert(0);
12779 out = 0;
12780 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012781
12782 Py_DECREF(sep_obj);
12783 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012784 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012785 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012786
12787 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012788 onError:
12789 Py_DECREF(sep_obj);
12790 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012791 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012792 PyMem_Free(buf2);
12793 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012794}
12795
12796PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012797 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012798\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012799Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012800the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012801found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012802
12803static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012804unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012805{
Victor Stinner9310abb2011-10-05 00:59:23 +020012806 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012807}
12808
12809PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012810 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012811\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012812Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012813the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012814separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012815
12816static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012817unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012818{
Victor Stinner9310abb2011-10-05 00:59:23 +020012819 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012820}
12821
Alexander Belopolsky40018472011-02-26 01:02:56 +000012822PyObject *
12823PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012824{
12825 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012826
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012827 s = PyUnicode_FromObject(s);
12828 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012829 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012830 if (sep != NULL) {
12831 sep = PyUnicode_FromObject(sep);
12832 if (sep == NULL) {
12833 Py_DECREF(s);
12834 return NULL;
12835 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012836 }
12837
Victor Stinner9310abb2011-10-05 00:59:23 +020012838 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012839
12840 Py_DECREF(s);
12841 Py_XDECREF(sep);
12842 return result;
12843}
12844
12845PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012846 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012847\n\
12848Return a list of the words in S, using sep as the\n\
12849delimiter string, starting at the end of the string and\n\
12850working to the front. If maxsplit is given, at most maxsplit\n\
12851splits are done. If sep is not specified, any whitespace string\n\
12852is a separator.");
12853
12854static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012855unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012856{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012857 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012858 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012859 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012860
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012861 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12862 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012863 return NULL;
12864
12865 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012866 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012867 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012868 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012869 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012870 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012871}
12872
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012873PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012874 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012875\n\
12876Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012877Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012878is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879
12880static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012881unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012882{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012883 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012884 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012886 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12887 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012888 return NULL;
12889
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012890 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891}
12892
12893static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012894PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012896 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012897}
12898
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012899PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012900 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012901\n\
12902Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012903and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012904
12905static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012906unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012907{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012908 if (PyUnicode_READY(self) == -1)
12909 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012910 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012911}
12912
Larry Hastings61272b72014-01-07 12:41:53 -080012913/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012914
Larry Hastings31826802013-10-19 00:09:25 -070012915@staticmethod
12916str.maketrans as unicode_maketrans
12917
12918 x: object
12919
12920 y: unicode=NULL
12921
12922 z: unicode=NULL
12923
12924 /
12925
12926Return a translation table usable for str.translate().
12927
12928If there is only one argument, it must be a dictionary mapping Unicode
12929ordinals (integers) or characters to Unicode ordinals, strings or None.
12930Character keys will be then converted to ordinals.
12931If there are two arguments, they must be strings of equal length, and
12932in the resulting dictionary, each character in x will be mapped to the
12933character at the same position in y. If there is a third argument, it
12934must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012935[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012936
Larry Hastings31826802013-10-19 00:09:25 -070012937static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012938unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030012939/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012940{
Georg Brandlceee0772007-11-27 23:48:05 +000012941 PyObject *new = NULL, *key, *value;
12942 Py_ssize_t i = 0;
12943 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012944
Georg Brandlceee0772007-11-27 23:48:05 +000012945 new = PyDict_New();
12946 if (!new)
12947 return NULL;
12948 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949 int x_kind, y_kind, z_kind;
12950 void *x_data, *y_data, *z_data;
12951
Georg Brandlceee0772007-11-27 23:48:05 +000012952 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012953 if (!PyUnicode_Check(x)) {
12954 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12955 "be a string if there is a second argument");
12956 goto err;
12957 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012958 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012959 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12960 "arguments must have equal length");
12961 goto err;
12962 }
12963 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012964 x_kind = PyUnicode_KIND(x);
12965 y_kind = PyUnicode_KIND(y);
12966 x_data = PyUnicode_DATA(x);
12967 y_data = PyUnicode_DATA(y);
12968 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12969 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012970 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012971 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012972 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012973 if (!value) {
12974 Py_DECREF(key);
12975 goto err;
12976 }
Georg Brandlceee0772007-11-27 23:48:05 +000012977 res = PyDict_SetItem(new, key, value);
12978 Py_DECREF(key);
12979 Py_DECREF(value);
12980 if (res < 0)
12981 goto err;
12982 }
12983 /* create entries for deleting chars in z */
12984 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012985 z_kind = PyUnicode_KIND(z);
12986 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012987 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012988 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012989 if (!key)
12990 goto err;
12991 res = PyDict_SetItem(new, key, Py_None);
12992 Py_DECREF(key);
12993 if (res < 0)
12994 goto err;
12995 }
12996 }
12997 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 int kind;
12999 void *data;
13000
Georg Brandlceee0772007-11-27 23:48:05 +000013001 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013002 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013003 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13004 "to maketrans it must be a dict");
13005 goto err;
13006 }
13007 /* copy entries into the new dict, converting string keys to int keys */
13008 while (PyDict_Next(x, &i, &key, &value)) {
13009 if (PyUnicode_Check(key)) {
13010 /* convert string keys to integer keys */
13011 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013012 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013013 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13014 "table must be of length 1");
13015 goto err;
13016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013017 kind = PyUnicode_KIND(key);
13018 data = PyUnicode_DATA(key);
13019 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013020 if (!newkey)
13021 goto err;
13022 res = PyDict_SetItem(new, newkey, value);
13023 Py_DECREF(newkey);
13024 if (res < 0)
13025 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013026 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013027 /* just keep integer keys */
13028 if (PyDict_SetItem(new, key, value) < 0)
13029 goto err;
13030 } else {
13031 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13032 "be strings or integers");
13033 goto err;
13034 }
13035 }
13036 }
13037 return new;
13038 err:
13039 Py_DECREF(new);
13040 return NULL;
13041}
13042
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013043PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013044 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013045\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013046Return a copy of the string S in which each character has been mapped\n\
13047through the given translation table. The table must implement\n\
13048lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13049mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13050this operation raises LookupError, the character is left untouched.\n\
13051Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013052
13053static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013054unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013055{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013056 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013057}
13058
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013059PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013060 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013061\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013062Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013063
13064static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013065unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013067 if (PyUnicode_READY(self) == -1)
13068 return NULL;
13069 if (PyUnicode_IS_ASCII(self))
13070 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013071 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013072}
13073
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013074PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013075 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013076\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013077Pad a numeric string S with zeros on the left, to fill a field\n\
13078of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013079
13080static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013081unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013083 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013084 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013085 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013086 int kind;
13087 void *data;
13088 Py_UCS4 chr;
13089
Martin v. Löwis18e16552006-02-15 17:27:45 +000013090 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013091 return NULL;
13092
Benjamin Petersonbac79492012-01-14 13:34:47 -050013093 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013094 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095
Victor Stinnerc4b49542011-12-11 22:44:26 +010013096 if (PyUnicode_GET_LENGTH(self) >= width)
13097 return unicode_result_unchanged(self);
13098
13099 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100
13101 u = pad(self, fill, 0, '0');
13102
Walter Dörwald068325e2002-04-15 13:36:47 +000013103 if (u == NULL)
13104 return NULL;
13105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013106 kind = PyUnicode_KIND(u);
13107 data = PyUnicode_DATA(u);
13108 chr = PyUnicode_READ(kind, data, fill);
13109
13110 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013112 PyUnicode_WRITE(kind, data, 0, chr);
13113 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114 }
13115
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013116 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013117 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13127
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013128PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013129 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013131Return True if S starts with the specified prefix, False otherwise.\n\
13132With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013133With optional end, stop comparing S at that position.\n\
13134prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013135
13136static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013137unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013138 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013140 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013141 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013142 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013143 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013144 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145
Jesus Ceaac451502011-04-20 17:09:23 +020013146 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013147 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013148 if (PyTuple_Check(subobj)) {
13149 Py_ssize_t i;
13150 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013151 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013152 if (substring == NULL)
13153 return NULL;
13154 result = tailmatch(self, substring, start, end, -1);
13155 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013156 if (result == -1)
13157 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013158 if (result) {
13159 Py_RETURN_TRUE;
13160 }
13161 }
13162 /* nothing matched */
13163 Py_RETURN_FALSE;
13164 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013165 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013166 if (substring == NULL) {
13167 if (PyErr_ExceptionMatches(PyExc_TypeError))
13168 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13169 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013170 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013171 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013172 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013173 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013174 if (result == -1)
13175 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013176 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013177}
13178
13179
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013180PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013181 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013182\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013183Return True if S ends with the specified suffix, False otherwise.\n\
13184With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013185With optional end, stop comparing S at that position.\n\
13186suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013187
13188static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013189unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013190 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013191{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013192 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013193 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013194 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013195 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013196 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013197
Jesus Ceaac451502011-04-20 17:09:23 +020013198 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013199 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013200 if (PyTuple_Check(subobj)) {
13201 Py_ssize_t i;
13202 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013203 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013204 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013205 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013206 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013207 result = tailmatch(self, substring, start, end, +1);
13208 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013209 if (result == -1)
13210 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013211 if (result) {
13212 Py_RETURN_TRUE;
13213 }
13214 }
13215 Py_RETURN_FALSE;
13216 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013217 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013218 if (substring == NULL) {
13219 if (PyErr_ExceptionMatches(PyExc_TypeError))
13220 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13221 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013222 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013223 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013224 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013225 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013226 if (result == -1)
13227 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013228 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013229}
13230
Victor Stinner202fdca2012-05-07 12:47:02 +020013231Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013232_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013233{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013234 if (!writer->readonly)
13235 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13236 else {
13237 /* Copy-on-write mode: set buffer size to 0 so
13238 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13239 * next write. */
13240 writer->size = 0;
13241 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013242 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13243 writer->data = PyUnicode_DATA(writer->buffer);
13244 writer->kind = PyUnicode_KIND(writer->buffer);
13245}
13246
Victor Stinnerd3f08822012-05-29 12:57:52 +020013247void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013248_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013249{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013250 memset(writer, 0, sizeof(*writer));
13251#ifdef Py_DEBUG
13252 writer->kind = 5; /* invalid kind */
13253#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013254 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013255}
13256
Victor Stinnerd3f08822012-05-29 12:57:52 +020013257int
13258_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13259 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013260{
Victor Stinner6989ba02013-11-18 21:08:39 +010013261#ifdef MS_WINDOWS
13262 /* On Windows, overallocate by 50% is the best factor */
13263# define OVERALLOCATE_FACTOR 2
13264#else
13265 /* On Linux, overallocate by 25% is the best factor */
13266# define OVERALLOCATE_FACTOR 4
13267#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013268 Py_ssize_t newlen;
13269 PyObject *newbuffer;
13270
Victor Stinnerd3f08822012-05-29 12:57:52 +020013271 assert(length > 0);
13272
Victor Stinner202fdca2012-05-07 12:47:02 +020013273 if (length > PY_SSIZE_T_MAX - writer->pos) {
13274 PyErr_NoMemory();
13275 return -1;
13276 }
13277 newlen = writer->pos + length;
13278
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013279 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013280
Victor Stinnerd3f08822012-05-29 12:57:52 +020013281 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013282 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013283 if (writer->overallocate
13284 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13285 /* overallocate to limit the number of realloc() */
13286 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013287 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013288 if (newlen < writer->min_length)
13289 newlen = writer->min_length;
13290
Victor Stinnerd3f08822012-05-29 12:57:52 +020013291 writer->buffer = PyUnicode_New(newlen, maxchar);
13292 if (writer->buffer == NULL)
13293 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013294 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013295 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013296 if (writer->overallocate
13297 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13298 /* overallocate to limit the number of realloc() */
13299 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013300 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013301 if (newlen < writer->min_length)
13302 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013303
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013304 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013305 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013306 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013307 newbuffer = PyUnicode_New(newlen, maxchar);
13308 if (newbuffer == NULL)
13309 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013310 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13311 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013312 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013313 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013314 }
13315 else {
13316 newbuffer = resize_compact(writer->buffer, newlen);
13317 if (newbuffer == NULL)
13318 return -1;
13319 }
13320 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013321 }
13322 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013323 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013324 newbuffer = PyUnicode_New(writer->size, maxchar);
13325 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013326 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013327 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13328 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013329 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013330 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013331 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013332 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013333
13334#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013335}
13336
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013337Py_LOCAL_INLINE(int)
13338_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013339{
13340 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13341 return -1;
13342 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13343 writer->pos++;
13344 return 0;
13345}
13346
13347int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013348_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13349{
13350 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13351}
13352
13353int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013354_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13355{
13356 Py_UCS4 maxchar;
13357 Py_ssize_t len;
13358
13359 if (PyUnicode_READY(str) == -1)
13360 return -1;
13361 len = PyUnicode_GET_LENGTH(str);
13362 if (len == 0)
13363 return 0;
13364 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13365 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013366 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013367 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013368 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013369 Py_INCREF(str);
13370 writer->buffer = str;
13371 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013372 writer->pos += len;
13373 return 0;
13374 }
13375 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13376 return -1;
13377 }
13378 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13379 str, 0, len);
13380 writer->pos += len;
13381 return 0;
13382}
13383
Victor Stinnere215d962012-10-06 23:03:36 +020013384int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013385_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13386 Py_ssize_t start, Py_ssize_t end)
13387{
13388 Py_UCS4 maxchar;
13389 Py_ssize_t len;
13390
13391 if (PyUnicode_READY(str) == -1)
13392 return -1;
13393
13394 assert(0 <= start);
13395 assert(end <= PyUnicode_GET_LENGTH(str));
13396 assert(start <= end);
13397
13398 if (end == 0)
13399 return 0;
13400
13401 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13402 return _PyUnicodeWriter_WriteStr(writer, str);
13403
13404 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13405 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13406 else
13407 maxchar = writer->maxchar;
13408 len = end - start;
13409
13410 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13411 return -1;
13412
13413 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13414 str, start, len);
13415 writer->pos += len;
13416 return 0;
13417}
13418
13419int
Victor Stinner4a587072013-11-19 12:54:53 +010013420_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13421 const char *ascii, Py_ssize_t len)
13422{
13423 if (len == -1)
13424 len = strlen(ascii);
13425
13426 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13427
13428 if (writer->buffer == NULL && !writer->overallocate) {
13429 PyObject *str;
13430
13431 str = _PyUnicode_FromASCII(ascii, len);
13432 if (str == NULL)
13433 return -1;
13434
13435 writer->readonly = 1;
13436 writer->buffer = str;
13437 _PyUnicodeWriter_Update(writer);
13438 writer->pos += len;
13439 return 0;
13440 }
13441
13442 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13443 return -1;
13444
13445 switch (writer->kind)
13446 {
13447 case PyUnicode_1BYTE_KIND:
13448 {
13449 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13450 Py_UCS1 *data = writer->data;
13451
13452 Py_MEMCPY(data + writer->pos, str, len);
13453 break;
13454 }
13455 case PyUnicode_2BYTE_KIND:
13456 {
13457 _PyUnicode_CONVERT_BYTES(
13458 Py_UCS1, Py_UCS2,
13459 ascii, ascii + len,
13460 (Py_UCS2 *)writer->data + writer->pos);
13461 break;
13462 }
13463 case PyUnicode_4BYTE_KIND:
13464 {
13465 _PyUnicode_CONVERT_BYTES(
13466 Py_UCS1, Py_UCS4,
13467 ascii, ascii + len,
13468 (Py_UCS4 *)writer->data + writer->pos);
13469 break;
13470 }
13471 default:
13472 assert(0);
13473 }
13474
13475 writer->pos += len;
13476 return 0;
13477}
13478
13479int
13480_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13481 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013482{
13483 Py_UCS4 maxchar;
13484
13485 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13486 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13487 return -1;
13488 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13489 writer->pos += len;
13490 return 0;
13491}
13492
Victor Stinnerd3f08822012-05-29 12:57:52 +020013493PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013494_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013495{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013496 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013497 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013498 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013499 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013500 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013501 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013502 str = writer->buffer;
13503 writer->buffer = NULL;
13504 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13505 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013506 }
13507 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13508 PyObject *newbuffer;
13509 newbuffer = resize_compact(writer->buffer, writer->pos);
13510 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013511 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013512 return NULL;
13513 }
13514 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013515 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013516 str = writer->buffer;
13517 writer->buffer = NULL;
13518 assert(_PyUnicode_CheckConsistency(str, 1));
13519 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013520}
13521
Victor Stinnerd3f08822012-05-29 12:57:52 +020013522void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013523_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013524{
13525 Py_CLEAR(writer->buffer);
13526}
13527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013528#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013529
13530PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013531 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013532\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013533Return a formatted version of S, using substitutions from args and kwargs.\n\
13534The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013535
Eric Smith27bbca62010-11-04 17:06:58 +000013536PyDoc_STRVAR(format_map__doc__,
13537 "S.format_map(mapping) -> str\n\
13538\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013539Return a formatted version of S, using substitutions from mapping.\n\
13540The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013541
Eric Smith4a7d76d2008-05-30 18:10:19 +000013542static PyObject *
13543unicode__format__(PyObject* self, PyObject* args)
13544{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013545 PyObject *format_spec;
13546 _PyUnicodeWriter writer;
13547 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013548
13549 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13550 return NULL;
13551
Victor Stinnerd3f08822012-05-29 12:57:52 +020013552 if (PyUnicode_READY(self) == -1)
13553 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013554 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013555 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13556 self, format_spec, 0,
13557 PyUnicode_GET_LENGTH(format_spec));
13558 if (ret == -1) {
13559 _PyUnicodeWriter_Dealloc(&writer);
13560 return NULL;
13561 }
13562 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013563}
13564
Eric Smith8c663262007-08-25 02:26:07 +000013565PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013566 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013567\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013568Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013569
13570static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013571unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013572{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013573 Py_ssize_t size;
13574
13575 /* If it's a compact object, account for base structure +
13576 character data. */
13577 if (PyUnicode_IS_COMPACT_ASCII(v))
13578 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13579 else if (PyUnicode_IS_COMPACT(v))
13580 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013581 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013582 else {
13583 /* If it is a two-block object, account for base object, and
13584 for character block if present. */
13585 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013586 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013587 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013588 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013589 }
13590 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013591 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013592 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013593 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013594 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013595 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013596
13597 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013598}
13599
13600PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013601 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013602
13603static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013604unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013605{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013606 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013607 if (!copy)
13608 return NULL;
13609 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013610}
13611
Guido van Rossumd57fd912000-03-10 22:53:23 +000013612static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013613 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013614 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013615 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13616 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013617 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13618 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013619 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013620 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13621 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13622 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013623 {"expandtabs", (PyCFunction) unicode_expandtabs,
13624 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013625 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013626 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013627 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13628 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13629 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013630 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013631 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13632 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13633 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013634 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013635 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013636 {"splitlines", (PyCFunction) unicode_splitlines,
13637 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013638 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013639 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13640 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13641 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13642 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13643 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13644 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13645 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13646 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13647 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13648 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13649 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13650 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13651 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13652 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013653 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013654 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013655 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013656 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013657 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013658 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013659 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013660 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013661#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013662 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013663 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013664#endif
13665
Benjamin Peterson14339b62009-01-31 16:36:08 +000013666 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013667 {NULL, NULL}
13668};
13669
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013670static PyObject *
13671unicode_mod(PyObject *v, PyObject *w)
13672{
Brian Curtindfc80e32011-08-10 20:28:54 -050013673 if (!PyUnicode_Check(v))
13674 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013675 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013676}
13677
13678static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013679 0, /*nb_add*/
13680 0, /*nb_subtract*/
13681 0, /*nb_multiply*/
13682 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013683};
13684
Guido van Rossumd57fd912000-03-10 22:53:23 +000013685static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013686 (lenfunc) unicode_length, /* sq_length */
13687 PyUnicode_Concat, /* sq_concat */
13688 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13689 (ssizeargfunc) unicode_getitem, /* sq_item */
13690 0, /* sq_slice */
13691 0, /* sq_ass_item */
13692 0, /* sq_ass_slice */
13693 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013694};
13695
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013696static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013697unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013698{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013699 if (PyUnicode_READY(self) == -1)
13700 return NULL;
13701
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013702 if (PyIndex_Check(item)) {
13703 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013704 if (i == -1 && PyErr_Occurred())
13705 return NULL;
13706 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013707 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013708 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013709 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013710 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013711 PyObject *result;
13712 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013713 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013714 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013716 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013717 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013718 return NULL;
13719 }
13720
13721 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013722 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013723 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013724 slicelength == PyUnicode_GET_LENGTH(self)) {
13725 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013726 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013727 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013728 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013729 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013730 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013731 src_kind = PyUnicode_KIND(self);
13732 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013733 if (!PyUnicode_IS_ASCII(self)) {
13734 kind_limit = kind_maxchar_limit(src_kind);
13735 max_char = 0;
13736 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13737 ch = PyUnicode_READ(src_kind, src_data, cur);
13738 if (ch > max_char) {
13739 max_char = ch;
13740 if (max_char >= kind_limit)
13741 break;
13742 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013743 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013744 }
Victor Stinner55c99112011-10-13 01:17:06 +020013745 else
13746 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013747 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013748 if (result == NULL)
13749 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013750 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013751 dest_data = PyUnicode_DATA(result);
13752
13753 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013754 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13755 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013756 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013757 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013758 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013759 } else {
13760 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13761 return NULL;
13762 }
13763}
13764
13765static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013766 (lenfunc)unicode_length, /* mp_length */
13767 (binaryfunc)unicode_subscript, /* mp_subscript */
13768 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013769};
13770
Guido van Rossumd57fd912000-03-10 22:53:23 +000013771
Guido van Rossumd57fd912000-03-10 22:53:23 +000013772/* Helpers for PyUnicode_Format() */
13773
Victor Stinnera47082312012-10-04 02:19:54 +020013774struct unicode_formatter_t {
13775 PyObject *args;
13776 int args_owned;
13777 Py_ssize_t arglen, argidx;
13778 PyObject *dict;
13779
13780 enum PyUnicode_Kind fmtkind;
13781 Py_ssize_t fmtcnt, fmtpos;
13782 void *fmtdata;
13783 PyObject *fmtstr;
13784
13785 _PyUnicodeWriter writer;
13786};
13787
13788struct unicode_format_arg_t {
13789 Py_UCS4 ch;
13790 int flags;
13791 Py_ssize_t width;
13792 int prec;
13793 int sign;
13794};
13795
Guido van Rossumd57fd912000-03-10 22:53:23 +000013796static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013797unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013798{
Victor Stinnera47082312012-10-04 02:19:54 +020013799 Py_ssize_t argidx = ctx->argidx;
13800
13801 if (argidx < ctx->arglen) {
13802 ctx->argidx++;
13803 if (ctx->arglen < 0)
13804 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013805 else
Victor Stinnera47082312012-10-04 02:19:54 +020013806 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013807 }
13808 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013809 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013810 return NULL;
13811}
13812
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013813/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013814
Victor Stinnera47082312012-10-04 02:19:54 +020013815/* Format a float into the writer if the writer is not NULL, or into *p_output
13816 otherwise.
13817
13818 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013819static int
Victor Stinnera47082312012-10-04 02:19:54 +020013820formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13821 PyObject **p_output,
13822 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013823{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013824 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013825 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013826 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013827 int prec;
13828 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013829
Guido van Rossumd57fd912000-03-10 22:53:23 +000013830 x = PyFloat_AsDouble(v);
13831 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013832 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013833
Victor Stinnera47082312012-10-04 02:19:54 +020013834 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013835 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013836 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013837
Victor Stinnera47082312012-10-04 02:19:54 +020013838 if (arg->flags & F_ALT)
13839 dtoa_flags = Py_DTSF_ALT;
13840 else
13841 dtoa_flags = 0;
13842 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013843 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013844 return -1;
13845 len = strlen(p);
13846 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013847 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013848 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013849 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013850 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013851 }
13852 else
13853 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013854 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013855 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013856}
13857
Victor Stinnerd0880d52012-04-27 23:40:13 +020013858/* formatlong() emulates the format codes d, u, o, x and X, and
13859 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13860 * Python's regular ints.
13861 * Return value: a new PyUnicodeObject*, or NULL if error.
13862 * The output string is of the form
13863 * "-"? ("0x" | "0X")? digit+
13864 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13865 * set in flags. The case of hex digits will be correct,
13866 * There will be at least prec digits, zero-filled on the left if
13867 * necessary to get that many.
13868 * val object to be converted
13869 * flags bitmask of format flags; only F_ALT is looked at
13870 * prec minimum number of digits; 0-fill on left if needed
13871 * type a character in [duoxX]; u acts the same as d
13872 *
13873 * CAUTION: o, x and X conversions on regular ints can never
13874 * produce a '-' sign, but can for Python's unbounded ints.
13875 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013876PyObject *
13877_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013878{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013879 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013880 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013881 Py_ssize_t i;
13882 int sign; /* 1 if '-', else 0 */
13883 int len; /* number of characters */
13884 Py_ssize_t llen;
13885 int numdigits; /* len == numnondigits + numdigits */
13886 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013887
Victor Stinnerd0880d52012-04-27 23:40:13 +020013888 /* Avoid exceeding SSIZE_T_MAX */
13889 if (prec > INT_MAX-3) {
13890 PyErr_SetString(PyExc_OverflowError,
13891 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013892 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013893 }
13894
13895 assert(PyLong_Check(val));
13896
13897 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013898 default:
13899 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013900 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013901 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013902 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013903 /* int and int subclasses should print numerically when a numeric */
13904 /* format code is used (see issue18780) */
13905 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013906 break;
13907 case 'o':
13908 numnondigits = 2;
13909 result = PyNumber_ToBase(val, 8);
13910 break;
13911 case 'x':
13912 case 'X':
13913 numnondigits = 2;
13914 result = PyNumber_ToBase(val, 16);
13915 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013916 }
13917 if (!result)
13918 return NULL;
13919
13920 assert(unicode_modifiable(result));
13921 assert(PyUnicode_IS_READY(result));
13922 assert(PyUnicode_IS_ASCII(result));
13923
13924 /* To modify the string in-place, there can only be one reference. */
13925 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013926 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013927 PyErr_BadInternalCall();
13928 return NULL;
13929 }
13930 buf = PyUnicode_DATA(result);
13931 llen = PyUnicode_GET_LENGTH(result);
13932 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013933 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013934 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080013935 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013936 return NULL;
13937 }
13938 len = (int)llen;
13939 sign = buf[0] == '-';
13940 numnondigits += sign;
13941 numdigits = len - numnondigits;
13942 assert(numdigits > 0);
13943
13944 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013945 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013946 (type == 'o' || type == 'x' || type == 'X'))) {
13947 assert(buf[sign] == '0');
13948 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13949 buf[sign+1] == 'o');
13950 numnondigits -= 2;
13951 buf += 2;
13952 len -= 2;
13953 if (sign)
13954 buf[0] = '-';
13955 assert(len == numnondigits + numdigits);
13956 assert(numdigits > 0);
13957 }
13958
13959 /* Fill with leading zeroes to meet minimum width. */
13960 if (prec > numdigits) {
13961 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13962 numnondigits + prec);
13963 char *b1;
13964 if (!r1) {
13965 Py_DECREF(result);
13966 return NULL;
13967 }
13968 b1 = PyBytes_AS_STRING(r1);
13969 for (i = 0; i < numnondigits; ++i)
13970 *b1++ = *buf++;
13971 for (i = 0; i < prec - numdigits; i++)
13972 *b1++ = '0';
13973 for (i = 0; i < numdigits; i++)
13974 *b1++ = *buf++;
13975 *b1 = '\0';
13976 Py_DECREF(result);
13977 result = r1;
13978 buf = PyBytes_AS_STRING(result);
13979 len = numnondigits + prec;
13980 }
13981
13982 /* Fix up case for hex conversions. */
13983 if (type == 'X') {
13984 /* Need to convert all lower case letters to upper case.
13985 and need to convert 0x to 0X (and -0x to -0X). */
13986 for (i = 0; i < len; i++)
13987 if (buf[i] >= 'a' && buf[i] <= 'x')
13988 buf[i] -= 'a'-'A';
13989 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013990 if (!PyUnicode_Check(result)
13991 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013992 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013993 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013994 Py_DECREF(result);
13995 result = unicode;
13996 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013997 else if (len != PyUnicode_GET_LENGTH(result)) {
13998 if (PyUnicode_Resize(&result, len) < 0)
13999 Py_CLEAR(result);
14000 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014001 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014002}
14003
Ethan Furmandf3ed242014-01-05 06:50:30 -080014004/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014005 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014006 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014007 * -1 and raise an exception on error */
14008static int
Victor Stinnera47082312012-10-04 02:19:54 +020014009mainformatlong(PyObject *v,
14010 struct unicode_format_arg_t *arg,
14011 PyObject **p_output,
14012 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014013{
14014 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014015 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014016
14017 if (!PyNumber_Check(v))
14018 goto wrongtype;
14019
Ethan Furman9ab74802014-03-21 06:38:46 -070014020 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014021 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014022 if (type == 'o' || type == 'x' || type == 'X') {
14023 iobj = PyNumber_Index(v);
14024 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014025 if (PyErr_ExceptionMatches(PyExc_TypeError))
14026 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014027 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014028 }
14029 }
14030 else {
14031 iobj = PyNumber_Long(v);
14032 if (iobj == NULL ) {
14033 if (PyErr_ExceptionMatches(PyExc_TypeError))
14034 goto wrongtype;
14035 return -1;
14036 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014037 }
14038 assert(PyLong_Check(iobj));
14039 }
14040 else {
14041 iobj = v;
14042 Py_INCREF(iobj);
14043 }
14044
14045 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014046 && arg->width == -1 && arg->prec == -1
14047 && !(arg->flags & (F_SIGN | F_BLANK))
14048 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014049 {
14050 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014051 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014052 int base;
14053
Victor Stinnera47082312012-10-04 02:19:54 +020014054 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014055 {
14056 default:
14057 assert(0 && "'type' not in [diuoxX]");
14058 case 'd':
14059 case 'i':
14060 case 'u':
14061 base = 10;
14062 break;
14063 case 'o':
14064 base = 8;
14065 break;
14066 case 'x':
14067 case 'X':
14068 base = 16;
14069 break;
14070 }
14071
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014072 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14073 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014074 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014075 }
14076 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014077 return 1;
14078 }
14079
Ethan Furmanb95b5612015-01-23 20:05:18 -080014080 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014081 Py_DECREF(iobj);
14082 if (res == NULL)
14083 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014084 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014085 return 0;
14086
14087wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014088 switch(type)
14089 {
14090 case 'o':
14091 case 'x':
14092 case 'X':
14093 PyErr_Format(PyExc_TypeError,
14094 "%%%c format: an integer is required, "
14095 "not %.200s",
14096 type, Py_TYPE(v)->tp_name);
14097 break;
14098 default:
14099 PyErr_Format(PyExc_TypeError,
14100 "%%%c format: a number is required, "
14101 "not %.200s",
14102 type, Py_TYPE(v)->tp_name);
14103 break;
14104 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014105 return -1;
14106}
14107
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014108static Py_UCS4
14109formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014110{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014111 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014112 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014113 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014114 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014115 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014116 goto onError;
14117 }
14118 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014119 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014120 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014121 /* make sure number is a type of integer */
14122 if (!PyLong_Check(v)) {
14123 iobj = PyNumber_Index(v);
14124 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014125 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014126 }
14127 v = iobj;
14128 Py_DECREF(iobj);
14129 }
14130 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014131 x = PyLong_AsLong(v);
14132 if (x == -1 && PyErr_Occurred())
14133 goto onError;
14134
Victor Stinner8faf8212011-12-08 22:14:11 +010014135 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014136 PyErr_SetString(PyExc_OverflowError,
14137 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014138 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014139 }
14140
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014141 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014142 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014143
Benjamin Peterson29060642009-01-31 22:14:21 +000014144 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014145 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014146 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014147 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014148}
14149
Victor Stinnera47082312012-10-04 02:19:54 +020014150/* Parse options of an argument: flags, width, precision.
14151 Handle also "%(name)" syntax.
14152
14153 Return 0 if the argument has been formatted into arg->str.
14154 Return 1 if the argument has been written into ctx->writer,
14155 Raise an exception and return -1 on error. */
14156static int
14157unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14158 struct unicode_format_arg_t *arg)
14159{
14160#define FORMAT_READ(ctx) \
14161 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14162
14163 PyObject *v;
14164
Victor Stinnera47082312012-10-04 02:19:54 +020014165 if (arg->ch == '(') {
14166 /* Get argument value from a dictionary. Example: "%(name)s". */
14167 Py_ssize_t keystart;
14168 Py_ssize_t keylen;
14169 PyObject *key;
14170 int pcount = 1;
14171
14172 if (ctx->dict == NULL) {
14173 PyErr_SetString(PyExc_TypeError,
14174 "format requires a mapping");
14175 return -1;
14176 }
14177 ++ctx->fmtpos;
14178 --ctx->fmtcnt;
14179 keystart = ctx->fmtpos;
14180 /* Skip over balanced parentheses */
14181 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14182 arg->ch = FORMAT_READ(ctx);
14183 if (arg->ch == ')')
14184 --pcount;
14185 else if (arg->ch == '(')
14186 ++pcount;
14187 ctx->fmtpos++;
14188 }
14189 keylen = ctx->fmtpos - keystart - 1;
14190 if (ctx->fmtcnt < 0 || pcount > 0) {
14191 PyErr_SetString(PyExc_ValueError,
14192 "incomplete format key");
14193 return -1;
14194 }
14195 key = PyUnicode_Substring(ctx->fmtstr,
14196 keystart, keystart + keylen);
14197 if (key == NULL)
14198 return -1;
14199 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014200 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014201 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014202 }
14203 ctx->args = PyObject_GetItem(ctx->dict, key);
14204 Py_DECREF(key);
14205 if (ctx->args == NULL)
14206 return -1;
14207 ctx->args_owned = 1;
14208 ctx->arglen = -1;
14209 ctx->argidx = -2;
14210 }
14211
14212 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014213 while (--ctx->fmtcnt >= 0) {
14214 arg->ch = FORMAT_READ(ctx);
14215 ctx->fmtpos++;
14216 switch (arg->ch) {
14217 case '-': arg->flags |= F_LJUST; continue;
14218 case '+': arg->flags |= F_SIGN; continue;
14219 case ' ': arg->flags |= F_BLANK; continue;
14220 case '#': arg->flags |= F_ALT; continue;
14221 case '0': arg->flags |= F_ZERO; continue;
14222 }
14223 break;
14224 }
14225
14226 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014227 if (arg->ch == '*') {
14228 v = unicode_format_getnextarg(ctx);
14229 if (v == NULL)
14230 return -1;
14231 if (!PyLong_Check(v)) {
14232 PyErr_SetString(PyExc_TypeError,
14233 "* wants int");
14234 return -1;
14235 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014236 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014237 if (arg->width == -1 && PyErr_Occurred())
14238 return -1;
14239 if (arg->width < 0) {
14240 arg->flags |= F_LJUST;
14241 arg->width = -arg->width;
14242 }
14243 if (--ctx->fmtcnt >= 0) {
14244 arg->ch = FORMAT_READ(ctx);
14245 ctx->fmtpos++;
14246 }
14247 }
14248 else if (arg->ch >= '0' && arg->ch <= '9') {
14249 arg->width = arg->ch - '0';
14250 while (--ctx->fmtcnt >= 0) {
14251 arg->ch = FORMAT_READ(ctx);
14252 ctx->fmtpos++;
14253 if (arg->ch < '0' || arg->ch > '9')
14254 break;
14255 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14256 mixing signed and unsigned comparison. Since arg->ch is between
14257 '0' and '9', casting to int is safe. */
14258 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14259 PyErr_SetString(PyExc_ValueError,
14260 "width too big");
14261 return -1;
14262 }
14263 arg->width = arg->width*10 + (arg->ch - '0');
14264 }
14265 }
14266
14267 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014268 if (arg->ch == '.') {
14269 arg->prec = 0;
14270 if (--ctx->fmtcnt >= 0) {
14271 arg->ch = FORMAT_READ(ctx);
14272 ctx->fmtpos++;
14273 }
14274 if (arg->ch == '*') {
14275 v = unicode_format_getnextarg(ctx);
14276 if (v == NULL)
14277 return -1;
14278 if (!PyLong_Check(v)) {
14279 PyErr_SetString(PyExc_TypeError,
14280 "* wants int");
14281 return -1;
14282 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014283 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014284 if (arg->prec == -1 && PyErr_Occurred())
14285 return -1;
14286 if (arg->prec < 0)
14287 arg->prec = 0;
14288 if (--ctx->fmtcnt >= 0) {
14289 arg->ch = FORMAT_READ(ctx);
14290 ctx->fmtpos++;
14291 }
14292 }
14293 else if (arg->ch >= '0' && arg->ch <= '9') {
14294 arg->prec = arg->ch - '0';
14295 while (--ctx->fmtcnt >= 0) {
14296 arg->ch = FORMAT_READ(ctx);
14297 ctx->fmtpos++;
14298 if (arg->ch < '0' || arg->ch > '9')
14299 break;
14300 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14301 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014302 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014303 return -1;
14304 }
14305 arg->prec = arg->prec*10 + (arg->ch - '0');
14306 }
14307 }
14308 }
14309
14310 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14311 if (ctx->fmtcnt >= 0) {
14312 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14313 if (--ctx->fmtcnt >= 0) {
14314 arg->ch = FORMAT_READ(ctx);
14315 ctx->fmtpos++;
14316 }
14317 }
14318 }
14319 if (ctx->fmtcnt < 0) {
14320 PyErr_SetString(PyExc_ValueError,
14321 "incomplete format");
14322 return -1;
14323 }
14324 return 0;
14325
14326#undef FORMAT_READ
14327}
14328
14329/* Format one argument. Supported conversion specifiers:
14330
14331 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014332 - "i", "d", "u": int or float
14333 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014334 - "e", "E", "f", "F", "g", "G": float
14335 - "c": int or str (1 character)
14336
Victor Stinner8dbd4212012-12-04 09:30:24 +010014337 When possible, the output is written directly into the Unicode writer
14338 (ctx->writer). A string is created when padding is required.
14339
Victor Stinnera47082312012-10-04 02:19:54 +020014340 Return 0 if the argument has been formatted into *p_str,
14341 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014342 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014343static int
14344unicode_format_arg_format(struct unicode_formatter_t *ctx,
14345 struct unicode_format_arg_t *arg,
14346 PyObject **p_str)
14347{
14348 PyObject *v;
14349 _PyUnicodeWriter *writer = &ctx->writer;
14350
14351 if (ctx->fmtcnt == 0)
14352 ctx->writer.overallocate = 0;
14353
14354 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014355 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014356 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014357 return 1;
14358 }
14359
14360 v = unicode_format_getnextarg(ctx);
14361 if (v == NULL)
14362 return -1;
14363
Victor Stinnera47082312012-10-04 02:19:54 +020014364
14365 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014366 case 's':
14367 case 'r':
14368 case 'a':
14369 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14370 /* Fast path */
14371 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14372 return -1;
14373 return 1;
14374 }
14375
14376 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14377 *p_str = v;
14378 Py_INCREF(*p_str);
14379 }
14380 else {
14381 if (arg->ch == 's')
14382 *p_str = PyObject_Str(v);
14383 else if (arg->ch == 'r')
14384 *p_str = PyObject_Repr(v);
14385 else
14386 *p_str = PyObject_ASCII(v);
14387 }
14388 break;
14389
14390 case 'i':
14391 case 'd':
14392 case 'u':
14393 case 'o':
14394 case 'x':
14395 case 'X':
14396 {
14397 int ret = mainformatlong(v, arg, p_str, writer);
14398 if (ret != 0)
14399 return ret;
14400 arg->sign = 1;
14401 break;
14402 }
14403
14404 case 'e':
14405 case 'E':
14406 case 'f':
14407 case 'F':
14408 case 'g':
14409 case 'G':
14410 if (arg->width == -1 && arg->prec == -1
14411 && !(arg->flags & (F_SIGN | F_BLANK)))
14412 {
14413 /* Fast path */
14414 if (formatfloat(v, arg, NULL, writer) == -1)
14415 return -1;
14416 return 1;
14417 }
14418
14419 arg->sign = 1;
14420 if (formatfloat(v, arg, p_str, NULL) == -1)
14421 return -1;
14422 break;
14423
14424 case 'c':
14425 {
14426 Py_UCS4 ch = formatchar(v);
14427 if (ch == (Py_UCS4) -1)
14428 return -1;
14429 if (arg->width == -1 && arg->prec == -1) {
14430 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014431 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014432 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014433 return 1;
14434 }
14435 *p_str = PyUnicode_FromOrdinal(ch);
14436 break;
14437 }
14438
14439 default:
14440 PyErr_Format(PyExc_ValueError,
14441 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014442 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014443 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14444 (int)arg->ch,
14445 ctx->fmtpos - 1);
14446 return -1;
14447 }
14448 if (*p_str == NULL)
14449 return -1;
14450 assert (PyUnicode_Check(*p_str));
14451 return 0;
14452}
14453
/* Write the already formatted string `str` into ctx->writer, applying the
   field width, the precision (truncation, "s"/"r"/"a" only), the sign, the
   fill character, left justification and the alternate-form prefix.

   arg->width and arg->sign are used as scratch values and are modified.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero fill is only honoured when arg->sign is set. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, truncation or forced sign is needed */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The sign comes from the string itself: remember it and skip
               it, so that len/pindex describe the part after the sign. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    if (!(arg->flags & F_LJUST)) {
        /* The fill character only matters if left padding will be written. */
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* len does not count the sign: when the remaining characters already
       fill the whole width, one more cell is needed for the sign. */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With a non-space fill the sign goes before the padding; with
           space fill it is written after the padding, further below. */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are accounted for separately from the
           remaining characters in both cases. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right if needed; right padding always uses spaces, whatever the
       fill character is */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14608
14609/* Helper of PyUnicode_Format(): format one arg.
14610 Return 0 on success, raise an exception and return -1 on error. */
14611static int
14612unicode_format_arg(struct unicode_formatter_t *ctx)
14613{
14614 struct unicode_format_arg_t arg;
14615 PyObject *str;
14616 int ret;
14617
Victor Stinner8dbd4212012-12-04 09:30:24 +010014618 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14619 arg.flags = 0;
14620 arg.width = -1;
14621 arg.prec = -1;
14622 arg.sign = 0;
14623 str = NULL;
14624
Victor Stinnera47082312012-10-04 02:19:54 +020014625 ret = unicode_format_arg_parse(ctx, &arg);
14626 if (ret == -1)
14627 return -1;
14628
14629 ret = unicode_format_arg_format(ctx, &arg, &str);
14630 if (ret == -1)
14631 return -1;
14632
14633 if (ret != 1) {
14634 ret = unicode_format_arg_output(ctx, &arg, str);
14635 Py_DECREF(str);
14636 if (ret == -1)
14637 return -1;
14638 }
14639
14640 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14641 PyErr_SetString(PyExc_TypeError,
14642 "not all arguments converted during string formatting");
14643 return -1;
14644 }
14645 return 0;
14646}
14647
Alexander Belopolsky40018472011-02-26 01:02:56 +000014648PyObject *
14649PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014650{
Victor Stinnera47082312012-10-04 02:19:54 +020014651 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014652
Guido van Rossumd57fd912000-03-10 22:53:23 +000014653 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014654 PyErr_BadInternalCall();
14655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014656 }
Victor Stinnera47082312012-10-04 02:19:54 +020014657
14658 ctx.fmtstr = PyUnicode_FromObject(format);
14659 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014660 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014661 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14662 Py_DECREF(ctx.fmtstr);
14663 return NULL;
14664 }
14665 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14666 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14667 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14668 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014669
Victor Stinner8f674cc2013-04-17 23:02:17 +020014670 _PyUnicodeWriter_Init(&ctx.writer);
14671 ctx.writer.min_length = ctx.fmtcnt + 100;
14672 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014673
Guido van Rossumd57fd912000-03-10 22:53:23 +000014674 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014675 ctx.arglen = PyTuple_Size(args);
14676 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014677 }
14678 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014679 ctx.arglen = -1;
14680 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014681 }
Victor Stinnera47082312012-10-04 02:19:54 +020014682 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014683 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014684 ctx.dict = args;
14685 else
14686 ctx.dict = NULL;
14687 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014688
Victor Stinnera47082312012-10-04 02:19:54 +020014689 while (--ctx.fmtcnt >= 0) {
14690 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014691 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014692
14693 nonfmtpos = ctx.fmtpos++;
14694 while (ctx.fmtcnt >= 0 &&
14695 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14696 ctx.fmtpos++;
14697 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014698 }
Victor Stinnera47082312012-10-04 02:19:54 +020014699 if (ctx.fmtcnt < 0) {
14700 ctx.fmtpos--;
14701 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014702 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014703
Victor Stinnercfc4c132013-04-03 01:48:39 +020014704 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14705 nonfmtpos, ctx.fmtpos) < 0)
14706 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014707 }
14708 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014709 ctx.fmtpos++;
14710 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014711 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014712 }
14713 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014714
Victor Stinnera47082312012-10-04 02:19:54 +020014715 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014716 PyErr_SetString(PyExc_TypeError,
14717 "not all arguments converted during string formatting");
14718 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014719 }
14720
Victor Stinnera47082312012-10-04 02:19:54 +020014721 if (ctx.args_owned) {
14722 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014723 }
Victor Stinnera47082312012-10-04 02:19:54 +020014724 Py_DECREF(ctx.fmtstr);
14725 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014726
Benjamin Peterson29060642009-01-31 22:14:21 +000014727 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014728 Py_DECREF(ctx.fmtstr);
14729 _PyUnicodeWriter_Dealloc(&ctx.writer);
14730 if (ctx.args_owned) {
14731 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014732 }
14733 return NULL;
14734}
14735
Jeremy Hylton938ace62002-07-17 16:30:39 +000014736static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014737unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14738
Tim Peters6d6c1a32001-08-02 04:15:00 +000014739static PyObject *
14740unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14741{
Benjamin Peterson29060642009-01-31 22:14:21 +000014742 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014743 static char *kwlist[] = {"object", "encoding", "errors", 0};
14744 char *encoding = NULL;
14745 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014746
Benjamin Peterson14339b62009-01-31 16:36:08 +000014747 if (type != &PyUnicode_Type)
14748 return unicode_subtype_new(type, args, kwds);
14749 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014750 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014751 return NULL;
14752 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014753 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014754 if (encoding == NULL && errors == NULL)
14755 return PyObject_Str(x);
14756 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014757 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014758}
14759
Guido van Rossume023fe02001-08-30 03:12:59 +000014760static PyObject *
14761unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14762{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014763 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014764 Py_ssize_t length, char_size;
14765 int share_wstr, share_utf8;
14766 unsigned int kind;
14767 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014768
Benjamin Peterson14339b62009-01-31 16:36:08 +000014769 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014770
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014771 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014772 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014773 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014774 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014775 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014776 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014777 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014778 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014779
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014780 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014781 if (self == NULL) {
14782 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014783 return NULL;
14784 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014785 kind = PyUnicode_KIND(unicode);
14786 length = PyUnicode_GET_LENGTH(unicode);
14787
14788 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014789#ifdef Py_DEBUG
14790 _PyUnicode_HASH(self) = -1;
14791#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014792 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014793#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014794 _PyUnicode_STATE(self).interned = 0;
14795 _PyUnicode_STATE(self).kind = kind;
14796 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014797 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014798 _PyUnicode_STATE(self).ready = 1;
14799 _PyUnicode_WSTR(self) = NULL;
14800 _PyUnicode_UTF8_LENGTH(self) = 0;
14801 _PyUnicode_UTF8(self) = NULL;
14802 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014803 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014804
14805 share_utf8 = 0;
14806 share_wstr = 0;
14807 if (kind == PyUnicode_1BYTE_KIND) {
14808 char_size = 1;
14809 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14810 share_utf8 = 1;
14811 }
14812 else if (kind == PyUnicode_2BYTE_KIND) {
14813 char_size = 2;
14814 if (sizeof(wchar_t) == 2)
14815 share_wstr = 1;
14816 }
14817 else {
14818 assert(kind == PyUnicode_4BYTE_KIND);
14819 char_size = 4;
14820 if (sizeof(wchar_t) == 4)
14821 share_wstr = 1;
14822 }
14823
14824 /* Ensure we won't overflow the length. */
14825 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14826 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014827 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014828 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014829 data = PyObject_MALLOC((length + 1) * char_size);
14830 if (data == NULL) {
14831 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014832 goto onError;
14833 }
14834
Victor Stinnerc3c74152011-10-02 20:39:55 +020014835 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014836 if (share_utf8) {
14837 _PyUnicode_UTF8_LENGTH(self) = length;
14838 _PyUnicode_UTF8(self) = data;
14839 }
14840 if (share_wstr) {
14841 _PyUnicode_WSTR_LENGTH(self) = length;
14842 _PyUnicode_WSTR(self) = (wchar_t *)data;
14843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014844
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014845 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014846 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014847 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014848#ifdef Py_DEBUG
14849 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14850#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014851 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014852 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014853
14854onError:
14855 Py_DECREF(unicode);
14856 Py_DECREF(self);
14857 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014858}
14859
/* Docstring of the str type; referenced by the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014871
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014872static PyObject *unicode_iter(PyObject *seq);
14873
/* Type object of str. The initializer is positional: every entry must stay
   in the slot order of PyTypeObject, as named in the trailing comments. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
14917
14918/* Initialize the Unicode implementation */
14919
Victor Stinner3a50e702011-10-18 21:21:00 +020014920int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014921{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014922 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014923 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014924 0x000A, /* LINE FEED */
14925 0x000D, /* CARRIAGE RETURN */
14926 0x001C, /* FILE SEPARATOR */
14927 0x001D, /* GROUP SEPARATOR */
14928 0x001E, /* RECORD SEPARATOR */
14929 0x0085, /* NEXT LINE */
14930 0x2028, /* LINE SEPARATOR */
14931 0x2029, /* PARAGRAPH SEPARATOR */
14932 };
14933
Fred Drakee4315f52000-05-09 19:53:39 +000014934 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014935 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014936 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014937 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014938 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014939
Guido van Rossumcacfc072002-05-24 19:01:59 +000014940 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014941 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014942
14943 /* initialize the linebreak bloom filter */
14944 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014945 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014946 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014947
Christian Heimes26532f72013-07-20 14:57:16 +020014948 if (PyType_Ready(&EncodingMapType) < 0)
14949 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014950
Benjamin Petersonc4311282012-10-30 23:21:10 -040014951 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14952 Py_FatalError("Can't initialize field name iterator type");
14953
14954 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14955 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014956
Victor Stinner3a50e702011-10-18 21:21:00 +020014957 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014958}
14959
14960/* Finalize the Unicode implementation */
14961
/* This implementation releases nothing; the result is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
14967
Guido van Rossumd57fd912000-03-10 22:53:23 +000014968void
Thomas Wouters78890102000-07-22 19:25:51 +000014969_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014970{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014971 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014972
Serhiy Storchaka05997252013-01-26 12:14:02 +020014973 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014974
Serhiy Storchaka05997252013-01-26 12:14:02 +020014975 for (i = 0; i < 256; i++)
14976 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014977 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014978 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014979}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014980
Walter Dörwald16807132007-05-25 13:52:07 +000014981void
14982PyUnicode_InternInPlace(PyObject **p)
14983{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014984 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014985 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014986#ifdef Py_DEBUG
14987 assert(s != NULL);
14988 assert(_PyUnicode_CHECK(s));
14989#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014990 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014991 return;
14992#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014993 /* If it's a subclass, we don't really know what putting
14994 it in the interned dict might do. */
14995 if (!PyUnicode_CheckExact(s))
14996 return;
14997 if (PyUnicode_CHECK_INTERNED(s))
14998 return;
14999 if (interned == NULL) {
15000 interned = PyDict_New();
15001 if (interned == NULL) {
15002 PyErr_Clear(); /* Don't leave an exception */
15003 return;
15004 }
15005 }
15006 /* It might be that the GetItem call fails even
15007 though the key is present in the dictionary,
15008 namely when this happens during a stack overflow. */
15009 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015010 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015011 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015012
Victor Stinnerf0335102013-04-14 19:13:03 +020015013 if (t) {
15014 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015015 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015016 return;
15017 }
Walter Dörwald16807132007-05-25 13:52:07 +000015018
Benjamin Peterson14339b62009-01-31 16:36:08 +000015019 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015020 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015021 PyErr_Clear();
15022 PyThreadState_GET()->recursion_critical = 0;
15023 return;
15024 }
15025 PyThreadState_GET()->recursion_critical = 0;
15026 /* The two references in interned are not counted by refcnt.
15027 The deallocator will take care of this */
15028 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015029 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015030}
15031
15032void
15033PyUnicode_InternImmortal(PyObject **p)
15034{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015035 PyUnicode_InternInPlace(p);
15036 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015037 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015038 Py_INCREF(*p);
15039 }
Walter Dörwald16807132007-05-25 13:52:07 +000015040}
15041
15042PyObject *
15043PyUnicode_InternFromString(const char *cp)
15044{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015045 PyObject *s = PyUnicode_FromString(cp);
15046 if (s == NULL)
15047 return NULL;
15048 PyUnicode_InternInPlace(&s);
15049 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015050}
15051
Alexander Belopolsky40018472011-02-26 01:02:56 +000015052void
15053_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015054{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015055 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015056 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015057 Py_ssize_t i, n;
15058 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015059
Benjamin Peterson14339b62009-01-31 16:36:08 +000015060 if (interned == NULL || !PyDict_Check(interned))
15061 return;
15062 keys = PyDict_Keys(interned);
15063 if (keys == NULL || !PyList_Check(keys)) {
15064 PyErr_Clear();
15065 return;
15066 }
Walter Dörwald16807132007-05-25 13:52:07 +000015067
Benjamin Peterson14339b62009-01-31 16:36:08 +000015068 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15069 detector, interned unicode strings are not forcibly deallocated;
15070 rather, we give them their stolen references back, and then clear
15071 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015072
Benjamin Peterson14339b62009-01-31 16:36:08 +000015073 n = PyList_GET_SIZE(keys);
15074 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015075 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015076 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015077 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015078 if (PyUnicode_READY(s) == -1) {
15079 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015080 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015082 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015083 case SSTATE_NOT_INTERNED:
15084 /* XXX Shouldn't happen */
15085 break;
15086 case SSTATE_INTERNED_IMMORTAL:
15087 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015088 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015089 break;
15090 case SSTATE_INTERNED_MORTAL:
15091 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015092 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015093 break;
15094 default:
15095 Py_FatalError("Inconsistent interned string state.");
15096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015097 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015098 }
15099 fprintf(stderr, "total size of all interned strings: "
15100 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15101 "mortal/immortal\n", mortal_size, immortal_size);
15102 Py_DECREF(keys);
15103 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015104 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015105}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015106
15107
15108/********************* Unicode Iterator **************************/
15109
15110typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015111 PyObject_HEAD
15112 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015113 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015114} unicodeiterobject;
15115
15116static void
15117unicodeiter_dealloc(unicodeiterobject *it)
15118{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015119 _PyObject_GC_UNTRACK(it);
15120 Py_XDECREF(it->it_seq);
15121 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015122}
15123
15124static int
15125unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15126{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015127 Py_VISIT(it->it_seq);
15128 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015129}
15130
15131static PyObject *
15132unicodeiter_next(unicodeiterobject *it)
15133{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015134 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015135
Benjamin Peterson14339b62009-01-31 16:36:08 +000015136 assert(it != NULL);
15137 seq = it->it_seq;
15138 if (seq == NULL)
15139 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015140 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015142 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15143 int kind = PyUnicode_KIND(seq);
15144 void *data = PyUnicode_DATA(seq);
15145 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15146 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015147 if (item != NULL)
15148 ++it->it_index;
15149 return item;
15150 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015151
Benjamin Peterson14339b62009-01-31 16:36:08 +000015152 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015153 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015154 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015155}
15156
15157static PyObject *
15158unicodeiter_len(unicodeiterobject *it)
15159{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015160 Py_ssize_t len = 0;
15161 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015162 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015163 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015164}
15165
15166PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15167
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015168static PyObject *
15169unicodeiter_reduce(unicodeiterobject *it)
15170{
15171 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015172 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015173 it->it_seq, it->it_index);
15174 } else {
15175 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15176 if (u == NULL)
15177 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015178 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015179 }
15180}
15181
15182PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15183
15184static PyObject *
15185unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15186{
15187 Py_ssize_t index = PyLong_AsSsize_t(state);
15188 if (index == -1 && PyErr_Occurred())
15189 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015190 if (it->it_seq != NULL) {
15191 if (index < 0)
15192 index = 0;
15193 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15194 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15195 it->it_index = index;
15196 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015197 Py_RETURN_NONE;
15198}
15199
15200PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15201
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015202static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015203 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015204 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015205 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15206 reduce_doc},
15207 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15208 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015209 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015210};
15211
15212PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015213 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15214 "str_iterator", /* tp_name */
15215 sizeof(unicodeiterobject), /* tp_basicsize */
15216 0, /* tp_itemsize */
15217 /* methods */
15218 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15219 0, /* tp_print */
15220 0, /* tp_getattr */
15221 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015222 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015223 0, /* tp_repr */
15224 0, /* tp_as_number */
15225 0, /* tp_as_sequence */
15226 0, /* tp_as_mapping */
15227 0, /* tp_hash */
15228 0, /* tp_call */
15229 0, /* tp_str */
15230 PyObject_GenericGetAttr, /* tp_getattro */
15231 0, /* tp_setattro */
15232 0, /* tp_as_buffer */
15233 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15234 0, /* tp_doc */
15235 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15236 0, /* tp_clear */
15237 0, /* tp_richcompare */
15238 0, /* tp_weaklistoffset */
15239 PyObject_SelfIter, /* tp_iter */
15240 (iternextfunc)unicodeiter_next, /* tp_iternext */
15241 unicodeiter_methods, /* tp_methods */
15242 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015243};
15244
15245static PyObject *
15246unicode_iter(PyObject *seq)
15247{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015248 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015249
Benjamin Peterson14339b62009-01-31 16:36:08 +000015250 if (!PyUnicode_Check(seq)) {
15251 PyErr_BadInternalCall();
15252 return NULL;
15253 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015254 if (PyUnicode_READY(seq) == -1)
15255 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015256 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15257 if (it == NULL)
15258 return NULL;
15259 it->it_index = 0;
15260 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015261 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015262 _PyObject_GC_TRACK(it);
15263 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015264}
15265
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015266
15267size_t
15268Py_UNICODE_strlen(const Py_UNICODE *u)
15269{
15270 int res = 0;
15271 while(*u++)
15272 res++;
15273 return res;
15274}
15275
15276Py_UNICODE*
15277Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15278{
15279 Py_UNICODE *u = s1;
15280 while ((*u++ = *s2++));
15281 return s1;
15282}
15283
15284Py_UNICODE*
15285Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15286{
15287 Py_UNICODE *u = s1;
15288 while ((*u++ = *s2++))
15289 if (n-- == 0)
15290 break;
15291 return s1;
15292}
15293
15294Py_UNICODE*
15295Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15296{
15297 Py_UNICODE *u1 = s1;
15298 u1 += Py_UNICODE_strlen(u1);
15299 Py_UNICODE_strcpy(u1, s2);
15300 return s1;
15301}
15302
15303int
15304Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15305{
15306 while (*s1 && *s2 && *s1 == *s2)
15307 s1++, s2++;
15308 if (*s1 && *s2)
15309 return (*s1 < *s2) ? -1 : +1;
15310 if (*s1)
15311 return 1;
15312 if (*s2)
15313 return -1;
15314 return 0;
15315}
15316
15317int
15318Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15319{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015320 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015321 for (; n != 0; n--) {
15322 u1 = *s1;
15323 u2 = *s2;
15324 if (u1 != u2)
15325 return (u1 < u2) ? -1 : +1;
15326 if (u1 == '\0')
15327 return 0;
15328 s1++;
15329 s2++;
15330 }
15331 return 0;
15332}
15333
15334Py_UNICODE*
15335Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15336{
15337 const Py_UNICODE *p;
15338 for (p = s; *p; p++)
15339 if (*p == c)
15340 return (Py_UNICODE*)p;
15341 return NULL;
15342}
15343
15344Py_UNICODE*
15345Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15346{
15347 const Py_UNICODE *p;
15348 p = s + Py_UNICODE_strlen(s);
15349 while (p != s) {
15350 p--;
15351 if (*p == c)
15352 return (Py_UNICODE*)p;
15353 }
15354 return NULL;
15355}
Victor Stinner331ea922010-08-10 16:37:20 +000015356
Victor Stinner71133ff2010-09-01 23:43:53 +000015357Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015358PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015359{
Victor Stinner577db2c2011-10-11 22:12:48 +020015360 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015361 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015363 if (!PyUnicode_Check(unicode)) {
15364 PyErr_BadArgument();
15365 return NULL;
15366 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015367 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015368 if (u == NULL)
15369 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015370 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015371 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015372 PyErr_NoMemory();
15373 return NULL;
15374 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015375 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015376 size *= sizeof(Py_UNICODE);
15377 copy = PyMem_Malloc(size);
15378 if (copy == NULL) {
15379 PyErr_NoMemory();
15380 return NULL;
15381 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015382 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015383 return copy;
15384}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015385
Georg Brandl66c221e2010-10-14 07:04:07 +000015386/* A _string module, to export formatter_parser and formatter_field_name_split
15387 to the string.Formatter class implemented in Python. */
15388
15389static PyMethodDef _string_methods[] = {
15390 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15391 METH_O, PyDoc_STR("split the argument as a field name")},
15392 {"formatter_parser", (PyCFunction) formatter_parser,
15393 METH_O, PyDoc_STR("parse the argument as a format string")},
15394 {NULL, NULL}
15395};
15396
15397static struct PyModuleDef _string_module = {
15398 PyModuleDef_HEAD_INIT,
15399 "_string",
15400 PyDoc_STR("string helper module"),
15401 0,
15402 _string_methods,
15403 NULL,
15404 NULL,
15405 NULL,
15406 NULL
15407};
15408
15409PyMODINIT_FUNC
15410PyInit__string(void)
15411{
15412 return PyModule_Create(&_string_module);
15413}
15414
15415
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015416#ifdef __cplusplus
15417}
15418#endif