blob: eb126551115c4c3a4287513a5d19e61e31e11770 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000050/* --- Globals ------------------------------------------------------------
51
Serhiy Storchaka05997252013-01-26 12:14:02 +020052NOTE: In the interpreter's initialization phase, some globals are currently
53 initialized dynamically as needed. In the process Unicode objects may
54 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055
56*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000058
59#ifdef __cplusplus
60extern "C" {
61#endif
62
Victor Stinner8faf8212011-12-08 22:14:11 +010063/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
64#define MAX_UNICODE 0x10ffff
65
Victor Stinner910337b2011-10-03 03:20:16 +020066#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020067# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020068#else
69# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
70#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020071
/* Field accessors for the string object layouts used in this file.

   Macros whose name starts with an underscore and that contain no assert()
   are raw field accesses: they cast `op` to the relevant struct and yield
   the member as an lvalue, without any validation.  The other macros
   assert _PyUnicode_CHECK(op) (and, where shown, readiness) first. */

/* Raw access to the utf8 member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer: for a compact ASCII object this is the address
   immediately following the PyASCIIObject header; for every other object
   it is the stored utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length: for a compact ASCII object this is the `length`
   member; for every other object it is the stored utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw access to the wstr member (PyASCIIObject) and to the wstr_length
   member (PyCompactUnicodeObject). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw access to the length, state and hash members of PyASCIIObject. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Like the raw state.kind / length accesses, but assert
   _PyUnicode_CHECK(op) first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data.any member of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200106
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New().
   The result is the bitwise OR of the two arguments, which is never smaller
   than either of them but is not necessarily equal to the larger one. */
109#define MAX_MAXCHAR(maxchar1, maxchar2) \
110 ((maxchar1) | (maxchar2))
111
/* File-local replacement for PyUnicode_READY: asserts _PyUnicode_CHECK(op),
   then evaluates to 0 when the object is already ready and to the result of
   _PyUnicode_Ready(op) otherwise.  `op` is evaluated more than once. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the utf8 pointer of `op` is the same address as its character
   data, i.e. the UTF-8 form shares the data buffer.  Must not be used on
   compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the same address as its character
   data, i.e. the wchar_t form shares the data buffer.
   The expansion refers only to the macro parameter `op`; it previously
   named a variable `unicode`, which silently bound to whatever variable of
   that name happened to exist at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): never for a compact ASCII object, and
   otherwise only when the utf8 pointer is non-NULL and differs from the
   data pointer */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either
   the object is not ready or wstr differs from the data pointer.  Because
   of short-circuit evaluation PyUnicode_DATA() is only evaluated for
   ready objects. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
142
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each of begin, end and to is evaluated exactly once.  The largest
   multiple-of-four prefix of the input is copied four elements per
   iteration; the remaining 0-3 elements are copied one at a time.  Every
   element is converted with a plain (to_type) cast.

   The `to` argument is parenthesized before being cast so that an
   expression such as `buf + offset` is cast as a whole rather than having
   the cast apply to `buf` alone. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200166
Walter Dörwald16807132007-05-25 13:52:07 +0000167/* This dictionary holds all interned unicode strings. Note that references
168 to strings in this dictionary are *not* counted in the string's ob_refcnt.
169 When the interned string reaches a refcnt of 0 the string deallocation
170 function will delete the reference from this dictionary.
171
   Another way to look at this is to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000173 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000174*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200175static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000176
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000177/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200179
/* Take a new reference to the shared empty string.  If the singleton does
   not exist yet it is created with PyUnicode_New(0, 0) first; when that
   call returns NULL, unicode_empty stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL) {                    \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
        else {                                          \
            Py_INCREF(unicode_empty);                   \
        }                                               \
    } while (0)

/* Return from the *calling* function with a new reference to the shared
   empty string (or with NULL if it could not be created). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000198
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200199/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200200static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000202/* Single character Unicode strings in the Latin-1 range are being
203 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200204static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000205
/* Fast detection of the most frequent whitespace characters.

   Lookup table with one entry per code point in the range 0x00..0x7F
   (128 entries, eight consecutive code points per row).  An entry is 1 for
   the whitespace characters named in the comments below and 0 for every
   other code point. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
236
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200237/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200238static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100240static int unicode_modifiable(PyObject *unicode);
241
Victor Stinnerfe226c02011-10-03 03:52:20 +0200242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100244_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200245static PyObject *
246_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
247static PyObject *
248_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
249
250static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000251unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000252 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100253 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
255
Alexander Belopolsky40018472011-02-26 01:02:56 +0000256static void
257raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300258 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100259 PyObject *unicode,
260 Py_ssize_t startpos, Py_ssize_t endpos,
261 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000262
/* Fast detection of line break characters in the ASCII range.

   Lookup table with one entry per code point in the range 0x00..0x7F
   (128 entries, eight consecutive code points per row).  An entry is 1 for
   the line break characters named in the comments below and 0 for every
   other code point.  The table is only ever read, so it is declared
   const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
290
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300291/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
292 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000293Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000294PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000296#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000297 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000298#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 /* This is actually an illegal character, so it should
300 not be passed to unichr. */
301 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#endif
303}
304
#ifdef Py_DEBUG
/* Debug-build sanity check of a Unicode object's internal invariants.

   op:            the object to inspect; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the object is not of the wchar kind),
                  the characters are also scanned to verify that the
                  narrowest suitable kind is used and that the buffer is
                  null-terminated.

   Every violated invariant trips an assert(); when all checks pass the
   function returns 1, so that it can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Case 1: compact ASCII -- must be a ready 1-byte string. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        /* Case 2: compact, non-ASCII -- the character data starts right
           after the PyCompactUnicodeObject header and the utf8 pointer
           must not alias it. */
        if (ascii->state.compact == 1) {
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* Case 3: non-compact -- the data lives behind data.any. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr buffer may be set. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* Ready: an ASCII string shares its data buffer as its
                   UTF-8 form, any other string must not. */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        /* wstr must alias the data exactly when the kind has the same
           width as wchar_t, and must not alias it otherwise. */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* An absent cached representation must have a zero length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* Find the largest code point actually stored in the string. */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* The largest code point must fall in the range that requires
           exactly this kind (and this ASCII flag). */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* The element just past the last character must be a null. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
420
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100421static PyObject*
422unicode_result_wchar(PyObject *unicode)
423{
424#ifndef Py_DEBUG
425 Py_ssize_t len;
426
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100427 len = _PyUnicode_WSTR_LENGTH(unicode);
428 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200430 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 }
432
433 if (len == 1) {
434 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100435 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100436 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
437 Py_DECREF(unicode);
438 return latin1_char;
439 }
440 }
441
442 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200443 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100444 return NULL;
445 }
446#else
Victor Stinneraa771272012-10-04 02:32:58 +0200447 assert(Py_REFCNT(unicode) == 1);
448
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100449 /* don't make the result ready in debug mode to ensure that the caller
450 makes the string ready before using it */
451 assert(_PyUnicode_CheckConsistency(unicode, 1));
452#endif
453 return unicode;
454}
455
456static PyObject*
457unicode_result_ready(PyObject *unicode)
458{
459 Py_ssize_t length;
460
461 length = PyUnicode_GET_LENGTH(unicode);
462 if (length == 0) {
463 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100464 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200465 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466 }
467 return unicode_empty;
468 }
469
470 if (length == 1) {
471 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
472 if (ch < 256) {
473 PyObject *latin1_char = unicode_latin1[ch];
474 if (latin1_char != NULL) {
475 if (unicode != latin1_char) {
476 Py_INCREF(latin1_char);
477 Py_DECREF(unicode);
478 }
479 return latin1_char;
480 }
481 else {
482 assert(_PyUnicode_CheckConsistency(unicode, 1));
483 Py_INCREF(unicode);
484 unicode_latin1[ch] = unicode;
485 return unicode;
486 }
487 }
488 }
489
490 assert(_PyUnicode_CheckConsistency(unicode, 1));
491 return unicode;
492}
493
494static PyObject*
495unicode_result(PyObject *unicode)
496{
497 assert(_PyUnicode_CHECK(unicode));
498 if (PyUnicode_IS_READY(unicode))
499 return unicode_result_ready(unicode);
500 else
501 return unicode_result_wchar(unicode);
502}
503
Victor Stinnerc4b49542011-12-11 22:44:26 +0100504static PyObject*
505unicode_result_unchanged(PyObject *unicode)
506{
507 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500508 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100509 return NULL;
510 Py_INCREF(unicode);
511 return unicode;
512 }
513 else
514 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100515 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100516}
517
Victor Stinner3a50e702011-10-18 21:21:00 +0200518#ifdef HAVE_MBCS
519static OSVERSIONINFOEX winver;
520#endif
521
Thomas Wouters477c8d52006-05-27 19:21:47 +0000522/* --- Bloom Filters ----------------------------------------------------- */
523
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
527
528/* the linebreak mask is set up by Unicode_Init below */
529
Antoine Pitrouf068f942010-01-13 14:19:12 +0000530#if LONG_BIT >= 128
531#define BLOOM_WIDTH 128
532#elif LONG_BIT >= 64
533#define BLOOM_WIDTH 64
534#elif LONG_BIT >= 32
535#define BLOOM_WIDTH 32
536#else
537#error "LONG_BIT is smaller than 32"
538#endif
539
Thomas Wouters477c8d52006-05-27 19:21:47 +0000540#define BLOOM_MASK unsigned long
541
Serhiy Storchaka05997252013-01-26 12:14:02 +0200542static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000543
/* BLOOM_ADD(mask, ch): set, in the lvalue `mask`, the bit whose index is
   given by the low bits of `ch` (ch & (BLOOM_WIDTH - 1)).
   BLOOM(mask, ch): non-zero if that bit is set in `mask`.
   Both arguments are parenthesized in the expansion so that compound
   expressions such as `a | b` can be passed as `mask`. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000546
Benjamin Peterson29060642009-01-31 22:14:21 +0000547#define BLOOM_LINEBREAK(ch) \
548 ((ch) < 128U ? ascii_linebreak[(ch)] : \
549 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000550
Alexander Belopolsky40018472011-02-26 01:02:56 +0000551Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200552make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000553{
554 /* calculate simple bloom-style bitmask for a given unicode string */
555
Antoine Pitrouf068f942010-01-13 14:19:12 +0000556 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000557 Py_ssize_t i;
558
559 mask = 0;
560 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000562
563 return mask;
564}
565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200566#define BLOOM_MEMBER(mask, chr, str) \
567 (BLOOM(mask, chr) \
568 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000569
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200570/* Compilation of templated routines */
571
572#include "stringlib/asciilib.h"
573#include "stringlib/fastsearch.h"
574#include "stringlib/partition.h"
575#include "stringlib/split.h"
576#include "stringlib/count.h"
577#include "stringlib/find.h"
578#include "stringlib/find_max_char.h"
579#include "stringlib/localeutil.h"
580#include "stringlib/undef.h"
581
582#include "stringlib/ucs1lib.h"
583#include "stringlib/fastsearch.h"
584#include "stringlib/partition.h"
585#include "stringlib/split.h"
586#include "stringlib/count.h"
587#include "stringlib/find.h"
588#include "stringlib/find_max_char.h"
589#include "stringlib/localeutil.h"
590#include "stringlib/undef.h"
591
592#include "stringlib/ucs2lib.h"
593#include "stringlib/fastsearch.h"
594#include "stringlib/partition.h"
595#include "stringlib/split.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
598#include "stringlib/find_max_char.h"
599#include "stringlib/localeutil.h"
600#include "stringlib/undef.h"
601
602#include "stringlib/ucs4lib.h"
603#include "stringlib/fastsearch.h"
604#include "stringlib/partition.h"
605#include "stringlib/split.h"
606#include "stringlib/count.h"
607#include "stringlib/find.h"
608#include "stringlib/find_max_char.h"
609#include "stringlib/localeutil.h"
610#include "stringlib/undef.h"
611
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612#include "stringlib/unicodedefs.h"
613#include "stringlib/fastsearch.h"
614#include "stringlib/count.h"
615#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100616#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200617
Guido van Rossumd57fd912000-03-10 22:53:23 +0000618/* --- Unicode Object ----------------------------------------------------- */
619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200620static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200621fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200622
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200623Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
624 Py_ssize_t size, Py_UCS4 ch,
625 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200626{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200627 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
628
629 switch (kind) {
630 case PyUnicode_1BYTE_KIND:
631 {
632 Py_UCS1 ch1 = (Py_UCS1) ch;
633 if (ch1 == ch)
634 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
635 else
636 return -1;
637 }
638 case PyUnicode_2BYTE_KIND:
639 {
640 Py_UCS2 ch2 = (Py_UCS2) ch;
641 if (ch2 == ch)
642 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
643 else
644 return -1;
645 }
646 case PyUnicode_4BYTE_KIND:
647 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
648 default:
649 assert(0);
650 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200651 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200652}
653
#ifdef Py_DEBUG
/* Fill the newly added part of a Unicode string with invalid characters to
   detect bugs earlier.

   The characters from index old_length up to (but not including) the
   current length are overwritten with 0xff bytes; nothing is done when the
   string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* The kind value is used as the number of bytes per character. */
    const int char_width = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(base + old_length * char_width, 0xff,
               (new_length - old_length) * char_width);
    }
}
#endif
672
Victor Stinnerfe226c02011-10-03 03:52:20 +0200673static PyObject*
674resize_compact(PyObject *unicode, Py_ssize_t length)
675{
676 Py_ssize_t char_size;
677 Py_ssize_t struct_size;
678 Py_ssize_t new_size;
679 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100680 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200681#ifdef Py_DEBUG
682 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
683#endif
684
Victor Stinner79891572012-05-03 13:43:07 +0200685 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200686 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100687 assert(PyUnicode_IS_COMPACT(unicode));
688
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200689 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100690 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200691 struct_size = sizeof(PyASCIIObject);
692 else
693 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200694 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695
Victor Stinnerfe226c02011-10-03 03:52:20 +0200696 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
697 PyErr_NoMemory();
698 return NULL;
699 }
700 new_size = (struct_size + (length + 1) * char_size);
701
Victor Stinner84def372011-12-11 20:04:56 +0100702 _Py_DEC_REFTOTAL;
703 _Py_ForgetReference(unicode);
704
705 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
706 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100707 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200708 PyErr_NoMemory();
709 return NULL;
710 }
Victor Stinner84def372011-12-11 20:04:56 +0100711 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200712 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100713
Victor Stinnerfe226c02011-10-03 03:52:20 +0200714 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200715 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100717 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200718 _PyUnicode_WSTR_LENGTH(unicode) = length;
719 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100720 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
721 PyObject_DEL(_PyUnicode_WSTR(unicode));
722 _PyUnicode_WSTR(unicode) = NULL;
723 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200724#ifdef Py_DEBUG
725 unicode_fill_invalid(unicode, old_length);
726#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200727 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
728 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200729 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200730 return unicode;
731}
732
Alexander Belopolsky40018472011-02-26 01:02:56 +0000733static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200734resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000735{
Victor Stinner95663112011-10-04 01:03:50 +0200736 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100737 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200738 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000740
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 if (PyUnicode_IS_READY(unicode)) {
742 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200743 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200744 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200745#ifdef Py_DEBUG
746 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
747#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200748
749 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200750 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200751 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
752 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200753
754 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
755 PyErr_NoMemory();
756 return -1;
757 }
758 new_size = (length + 1) * char_size;
759
Victor Stinner7a9105a2011-12-12 00:13:42 +0100760 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
761 {
762 PyObject_DEL(_PyUnicode_UTF8(unicode));
763 _PyUnicode_UTF8(unicode) = NULL;
764 _PyUnicode_UTF8_LENGTH(unicode) = 0;
765 }
766
Victor Stinnerfe226c02011-10-03 03:52:20 +0200767 data = (PyObject *)PyObject_REALLOC(data, new_size);
768 if (data == NULL) {
769 PyErr_NoMemory();
770 return -1;
771 }
772 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200773 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200775 _PyUnicode_WSTR_LENGTH(unicode) = length;
776 }
777 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200778 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200779 _PyUnicode_UTF8_LENGTH(unicode) = length;
780 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200781 _PyUnicode_LENGTH(unicode) = length;
782 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200783#ifdef Py_DEBUG
784 unicode_fill_invalid(unicode, old_length);
785#endif
Victor Stinner95663112011-10-04 01:03:50 +0200786 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200787 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200788 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200790 }
Victor Stinner95663112011-10-04 01:03:50 +0200791 assert(_PyUnicode_WSTR(unicode) != NULL);
792
793 /* check for integer overflow */
794 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
795 PyErr_NoMemory();
796 return -1;
797 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100798 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200799 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100800 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200801 if (!wstr) {
802 PyErr_NoMemory();
803 return -1;
804 }
805 _PyUnicode_WSTR(unicode) = wstr;
806 _PyUnicode_WSTR(unicode)[length] = 0;
807 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200808 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809 return 0;
810}
811
Victor Stinnerfe226c02011-10-03 03:52:20 +0200812static PyObject*
813resize_copy(PyObject *unicode, Py_ssize_t length)
814{
815 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100816 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200817 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100818
Benjamin Petersonbac79492012-01-14 13:34:47 -0500819 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100820 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200821
822 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
823 if (copy == NULL)
824 return NULL;
825
826 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200827 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200828 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200829 }
830 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200831 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100832
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200833 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200834 if (w == NULL)
835 return NULL;
836 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
837 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200838 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
839 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200840 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200841 }
842}
843
/* We allocate one more character than requested to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
852
Alexander Belopolsky40018472011-02-26 01:02:56 +0000853static PyUnicodeObject *
854_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000855{
856 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200857 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000858
Thomas Wouters477c8d52006-05-27 19:21:47 +0000859 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000860 if (length == 0 && unicode_empty != NULL) {
861 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200862 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000863 }
864
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000865 /* Ensure we won't overflow the size. */
866 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
867 return (PyUnicodeObject *)PyErr_NoMemory();
868 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200869 if (length < 0) {
870 PyErr_SetString(PyExc_SystemError,
871 "Negative size passed to _PyUnicode_New");
872 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873 }
874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200875 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
876 if (unicode == NULL)
877 return NULL;
878 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
879 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
880 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100881 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000882 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100883 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000884 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200885
Jeremy Hyltond8082792003-09-16 19:41:39 +0000886 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000887 * the caller fails before initializing str -- unicode_resize()
888 * reads str[0], and the Keep-Alive optimization can keep memory
889 * allocated for str alive across a call to unicode_dealloc(unicode).
890 * We don't want unicode_resize to read uninitialized memory in
891 * that case.
892 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200893 _PyUnicode_WSTR(unicode)[0] = 0;
894 _PyUnicode_WSTR(unicode)[length] = 0;
895 _PyUnicode_WSTR_LENGTH(unicode) = length;
896 _PyUnicode_HASH(unicode) = -1;
897 _PyUnicode_STATE(unicode).interned = 0;
898 _PyUnicode_STATE(unicode).kind = 0;
899 _PyUnicode_STATE(unicode).compact = 0;
900 _PyUnicode_STATE(unicode).ready = 0;
901 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200902 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200903 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200904 _PyUnicode_UTF8(unicode) = NULL;
905 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100906 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000907 return unicode;
908}
909
Victor Stinnerf42dc442011-10-02 23:33:16 +0200910static const char*
911unicode_kind_name(PyObject *unicode)
912{
Victor Stinner42dfd712011-10-03 14:41:45 +0200913 /* don't check consistency: unicode_kind_name() is called from
914 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200915 if (!PyUnicode_IS_COMPACT(unicode))
916 {
917 if (!PyUnicode_IS_READY(unicode))
918 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600919 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200920 {
921 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200922 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200923 return "legacy ascii";
924 else
925 return "legacy latin1";
926 case PyUnicode_2BYTE_KIND:
927 return "legacy UCS2";
928 case PyUnicode_4BYTE_KIND:
929 return "legacy UCS4";
930 default:
931 return "<legacy invalid kind>";
932 }
933 }
934 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600935 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200936 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200937 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200938 return "ascii";
939 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200940 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200941 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200942 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200944 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200945 default:
946 return "<invalid compact kind>";
947 }
948}
949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200950#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
955
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
959void *_PyUnicode_data(void *unicode){
960 printf("obj %p\n", unicode);
961 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
962 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
963 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
964 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
965 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
966 return PyUnicode_DATA(unicode);
967}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200968
969void
970_PyUnicode_Dump(PyObject *op)
971{
972 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200973 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
974 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
975 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200976
Victor Stinnera849a4b2011-10-03 12:12:11 +0200977 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200978 {
979 if (ascii->state.ascii)
980 data = (ascii + 1);
981 else
982 data = (compact + 1);
983 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200984 else
985 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200986 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
987
Victor Stinnera849a4b2011-10-03 12:12:11 +0200988 if (ascii->wstr == data)
989 printf("shared ");
990 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200991
Victor Stinnera3b334d2011-10-03 13:53:37 +0200992 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200993 printf(" (%zu), ", compact->wstr_length);
994 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
995 printf("shared ");
996 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200997 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200998 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200999}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001000#endif
1001
1002PyObject *
1003PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1004{
1005 PyObject *obj;
1006 PyCompactUnicodeObject *unicode;
1007 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001008 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001009 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001010 Py_ssize_t char_size;
1011 Py_ssize_t struct_size;
1012
1013 /* Optimization for empty strings */
1014 if (size == 0 && unicode_empty != NULL) {
1015 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001016 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001017 }
1018
Victor Stinner9e9d6892011-10-04 01:02:02 +02001019 is_ascii = 0;
1020 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001021 struct_size = sizeof(PyCompactUnicodeObject);
1022 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001023 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001024 char_size = 1;
1025 is_ascii = 1;
1026 struct_size = sizeof(PyASCIIObject);
1027 }
1028 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001029 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030 char_size = 1;
1031 }
1032 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001033 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001034 char_size = 2;
1035 if (sizeof(wchar_t) == 2)
1036 is_sharing = 1;
1037 }
1038 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001039 if (maxchar > MAX_UNICODE) {
1040 PyErr_SetString(PyExc_SystemError,
1041 "invalid maximum character passed to PyUnicode_New");
1042 return NULL;
1043 }
Victor Stinner8f825062012-04-27 13:55:39 +02001044 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 char_size = 4;
1046 if (sizeof(wchar_t) == 4)
1047 is_sharing = 1;
1048 }
1049
1050 /* Ensure we won't overflow the size. */
1051 if (size < 0) {
1052 PyErr_SetString(PyExc_SystemError,
1053 "Negative size passed to PyUnicode_New");
1054 return NULL;
1055 }
1056 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1057 return PyErr_NoMemory();
1058
1059 /* Duplicated allocation code from _PyObject_New() instead of a call to
1060 * PyObject_New() so we are able to allocate space for the object and
1061 * it's data buffer.
1062 */
1063 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1064 if (obj == NULL)
1065 return PyErr_NoMemory();
1066 obj = PyObject_INIT(obj, &PyUnicode_Type);
1067 if (obj == NULL)
1068 return NULL;
1069
1070 unicode = (PyCompactUnicodeObject *)obj;
1071 if (is_ascii)
1072 data = ((PyASCIIObject*)obj) + 1;
1073 else
1074 data = unicode + 1;
1075 _PyUnicode_LENGTH(unicode) = size;
1076 _PyUnicode_HASH(unicode) = -1;
1077 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001078 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001079 _PyUnicode_STATE(unicode).compact = 1;
1080 _PyUnicode_STATE(unicode).ready = 1;
1081 _PyUnicode_STATE(unicode).ascii = is_ascii;
1082 if (is_ascii) {
1083 ((char*)data)[size] = 0;
1084 _PyUnicode_WSTR(unicode) = NULL;
1085 }
Victor Stinner8f825062012-04-27 13:55:39 +02001086 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001087 ((char*)data)[size] = 0;
1088 _PyUnicode_WSTR(unicode) = NULL;
1089 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001090 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001091 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001093 else {
1094 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001095 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001096 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001098 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001099 ((Py_UCS4*)data)[size] = 0;
1100 if (is_sharing) {
1101 _PyUnicode_WSTR_LENGTH(unicode) = size;
1102 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1103 }
1104 else {
1105 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1106 _PyUnicode_WSTR(unicode) = NULL;
1107 }
1108 }
Victor Stinner8f825062012-04-27 13:55:39 +02001109#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001110 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001111#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001112 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001113 return obj;
1114}
1115
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store the result
   in the 4-byte data of `unicode`.  A high surrogate immediately followed by
   a low surrogate is joined into one code point; every other unit,
   including a lone surrogate, is copied as is.  The other conversions are
   implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    src = begin;
    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (!Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            || (src+1) >= end
            || !Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Not the start of a complete pair: plain copy. */
            *dst++ = *src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1155
Victor Stinnercd9950f2011-10-02 00:34:53 +02001156static int
Victor Stinner488fa492011-12-12 00:01:39 +01001157unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001158{
Victor Stinner488fa492011-12-12 00:01:39 +01001159 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001160 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001161 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001162 return -1;
1163 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001164 return 0;
1165}
1166
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001167static int
1168_copy_characters(PyObject *to, Py_ssize_t to_start,
1169 PyObject *from, Py_ssize_t from_start,
1170 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001172 unsigned int from_kind, to_kind;
1173 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001174
Victor Stinneree4544c2012-05-09 22:24:08 +02001175 assert(0 <= how_many);
1176 assert(0 <= from_start);
1177 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001178 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001179 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001180 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181
Victor Stinnerd3f08822012-05-29 12:57:52 +02001182 assert(PyUnicode_Check(to));
1183 assert(PyUnicode_IS_READY(to));
1184 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1185
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001186 if (how_many == 0)
1187 return 0;
1188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001189 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001190 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001191 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001192 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001193
Victor Stinnerf1852262012-06-16 16:38:26 +02001194#ifdef Py_DEBUG
1195 if (!check_maxchar
1196 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1197 {
1198 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1199 Py_UCS4 ch;
1200 Py_ssize_t i;
1201 for (i=0; i < how_many; i++) {
1202 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1203 assert(ch <= to_maxchar);
1204 }
1205 }
1206#endif
1207
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001208 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001209 if (check_maxchar
1210 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1211 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001212 /* Writing Latin-1 characters into an ASCII string requires to
1213 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001214 Py_UCS4 max_char;
1215 max_char = ucs1lib_find_max_char(from_data,
1216 (Py_UCS1*)from_data + how_many);
1217 if (max_char >= 128)
1218 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001219 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001220 Py_MEMCPY((char*)to_data + to_kind * to_start,
1221 (char*)from_data + from_kind * from_start,
1222 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001223 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001224 else if (from_kind == PyUnicode_1BYTE_KIND
1225 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001226 {
1227 _PyUnicode_CONVERT_BYTES(
1228 Py_UCS1, Py_UCS2,
1229 PyUnicode_1BYTE_DATA(from) + from_start,
1230 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1231 PyUnicode_2BYTE_DATA(to) + to_start
1232 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001233 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001234 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001235 && to_kind == PyUnicode_4BYTE_KIND)
1236 {
1237 _PyUnicode_CONVERT_BYTES(
1238 Py_UCS1, Py_UCS4,
1239 PyUnicode_1BYTE_DATA(from) + from_start,
1240 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1241 PyUnicode_4BYTE_DATA(to) + to_start
1242 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001243 }
1244 else if (from_kind == PyUnicode_2BYTE_KIND
1245 && to_kind == PyUnicode_4BYTE_KIND)
1246 {
1247 _PyUnicode_CONVERT_BYTES(
1248 Py_UCS2, Py_UCS4,
1249 PyUnicode_2BYTE_DATA(from) + from_start,
1250 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1251 PyUnicode_4BYTE_DATA(to) + to_start
1252 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001253 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001254 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001255 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1256
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001257 if (!check_maxchar) {
1258 if (from_kind == PyUnicode_2BYTE_KIND
1259 && to_kind == PyUnicode_1BYTE_KIND)
1260 {
1261 _PyUnicode_CONVERT_BYTES(
1262 Py_UCS2, Py_UCS1,
1263 PyUnicode_2BYTE_DATA(from) + from_start,
1264 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1265 PyUnicode_1BYTE_DATA(to) + to_start
1266 );
1267 }
1268 else if (from_kind == PyUnicode_4BYTE_KIND
1269 && to_kind == PyUnicode_1BYTE_KIND)
1270 {
1271 _PyUnicode_CONVERT_BYTES(
1272 Py_UCS4, Py_UCS1,
1273 PyUnicode_4BYTE_DATA(from) + from_start,
1274 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1275 PyUnicode_1BYTE_DATA(to) + to_start
1276 );
1277 }
1278 else if (from_kind == PyUnicode_4BYTE_KIND
1279 && to_kind == PyUnicode_2BYTE_KIND)
1280 {
1281 _PyUnicode_CONVERT_BYTES(
1282 Py_UCS4, Py_UCS2,
1283 PyUnicode_4BYTE_DATA(from) + from_start,
1284 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1285 PyUnicode_2BYTE_DATA(to) + to_start
1286 );
1287 }
1288 else {
1289 assert(0);
1290 return -1;
1291 }
1292 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001293 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001294 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001295 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001296 Py_ssize_t i;
1297
Victor Stinnera0702ab2011-09-29 14:14:38 +02001298 for (i=0; i < how_many; i++) {
1299 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001300 if (ch > to_maxchar)
1301 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001302 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1303 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001304 }
1305 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001306 return 0;
1307}
1308
Victor Stinnerd3f08822012-05-29 12:57:52 +02001309void
1310_PyUnicode_FastCopyCharacters(
1311 PyObject *to, Py_ssize_t to_start,
1312 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001313{
1314 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1315}
1316
1317Py_ssize_t
1318PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1319 PyObject *from, Py_ssize_t from_start,
1320 Py_ssize_t how_many)
1321{
1322 int err;
1323
1324 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1325 PyErr_BadInternalCall();
1326 return -1;
1327 }
1328
Benjamin Petersonbac79492012-01-14 13:34:47 -05001329 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001330 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001331 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001332 return -1;
1333
Victor Stinnerd3f08822012-05-29 12:57:52 +02001334 if (from_start < 0) {
1335 PyErr_SetString(PyExc_IndexError, "string index out of range");
1336 return -1;
1337 }
1338 if (to_start < 0) {
1339 PyErr_SetString(PyExc_IndexError, "string index out of range");
1340 return -1;
1341 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001342 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1343 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1344 PyErr_Format(PyExc_SystemError,
1345 "Cannot write %zi characters at %zi "
1346 "in a string of %zi characters",
1347 how_many, to_start, PyUnicode_GET_LENGTH(to));
1348 return -1;
1349 }
1350
1351 if (how_many == 0)
1352 return 0;
1353
Victor Stinner488fa492011-12-12 00:01:39 +01001354 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001355 return -1;
1356
1357 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1358 if (err) {
1359 PyErr_Format(PyExc_SystemError,
1360 "Cannot copy %s characters "
1361 "into a string of %s characters",
1362 unicode_kind_name(from),
1363 unicode_kind_name(to));
1364 return -1;
1365 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001366 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367}
1368
Victor Stinner17222162011-09-28 22:15:37 +02001369/* Find the maximum code point and count the number of surrogate pairs so a
1370 correct string length can be computed before converting a string to UCS4.
1371 This function counts single surrogates as a character and not as a pair.
1372
1373 Return 0 on success, or -1 on error. */
1374static int
1375find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1376 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377{
1378 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001379 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380
Victor Stinnerc53be962011-10-02 21:33:54 +02001381 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382 *num_surrogates = 0;
1383 *maxchar = 0;
1384
1385 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001387 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1388 && (iter+1) < end
1389 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1390 {
1391 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1392 ++(*num_surrogates);
1393 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 }
1395 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001397 {
1398 ch = *iter;
1399 iter++;
1400 }
1401 if (ch > *maxchar) {
1402 *maxchar = ch;
1403 if (*maxchar > MAX_UNICODE) {
1404 PyErr_Format(PyExc_ValueError,
1405 "character U+%x is not in range [U+0000; U+10ffff]",
1406 ch);
1407 return -1;
1408 }
1409 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 }
1411 return 0;
1412}
1413
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001414int
1415_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416{
1417 wchar_t *end;
1418 Py_UCS4 maxchar = 0;
1419 Py_ssize_t num_surrogates;
1420#if SIZEOF_WCHAR_T == 2
1421 Py_ssize_t length_wo_surrogates;
1422#endif
1423
Georg Brandl7597add2011-10-05 16:36:47 +02001424 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001425 strings were created using _PyObject_New() and where no canonical
1426 representation (the str field) has been set yet aka strings
1427 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001428 assert(_PyUnicode_CHECK(unicode));
1429 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001431 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001432 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001433 /* Actually, it should neither be interned nor be anything else: */
1434 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001436 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001437 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001438 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001439 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440
1441 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001442 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1443 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444 PyErr_NoMemory();
1445 return -1;
1446 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001447 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 _PyUnicode_WSTR(unicode), end,
1449 PyUnicode_1BYTE_DATA(unicode));
1450 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1451 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1452 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1453 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001454 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001455 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001456 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 }
1458 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001459 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001460 _PyUnicode_UTF8(unicode) = NULL;
1461 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 }
1463 PyObject_FREE(_PyUnicode_WSTR(unicode));
1464 _PyUnicode_WSTR(unicode) = NULL;
1465 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1466 }
1467 /* In this case we might have to convert down from 4-byte native
1468 wchar_t to 2-byte unicode. */
1469 else if (maxchar < 65536) {
1470 assert(num_surrogates == 0 &&
1471 "FindMaxCharAndNumSurrogatePairs() messed up");
1472
Victor Stinner506f5922011-09-28 22:34:18 +02001473#if SIZEOF_WCHAR_T == 2
1474 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001475 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001476 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1477 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1478 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001479 _PyUnicode_UTF8(unicode) = NULL;
1480 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001481#else
1482 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001483 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001484 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001485 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001486 PyErr_NoMemory();
1487 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 }
Victor Stinner506f5922011-09-28 22:34:18 +02001489 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1490 _PyUnicode_WSTR(unicode), end,
1491 PyUnicode_2BYTE_DATA(unicode));
1492 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1493 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1494 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001495 _PyUnicode_UTF8(unicode) = NULL;
1496 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001497 PyObject_FREE(_PyUnicode_WSTR(unicode));
1498 _PyUnicode_WSTR(unicode) = NULL;
1499 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1500#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001501 }
1502 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1503 else {
1504#if SIZEOF_WCHAR_T == 2
1505 /* in case the native representation is 2-bytes, we need to allocate a
1506 new normalized 4-byte version. */
1507 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001508 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1509 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001510 PyErr_NoMemory();
1511 return -1;
1512 }
1513 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1514 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001515 _PyUnicode_UTF8(unicode) = NULL;
1516 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001517 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1518 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001519 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520 PyObject_FREE(_PyUnicode_WSTR(unicode));
1521 _PyUnicode_WSTR(unicode) = NULL;
1522 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1523#else
1524 assert(num_surrogates == 0);
1525
Victor Stinnerc3c74152011-10-02 20:39:55 +02001526 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001527 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001528 _PyUnicode_UTF8(unicode) = NULL;
1529 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001530 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1531#endif
1532 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1533 }
1534 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001535 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001536 return 0;
1537}
1538
Alexander Belopolsky40018472011-02-26 01:02:56 +00001539static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001540unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541{
Walter Dörwald16807132007-05-25 13:52:07 +00001542 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001543 case SSTATE_NOT_INTERNED:
1544 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001545
Benjamin Peterson29060642009-01-31 22:14:21 +00001546 case SSTATE_INTERNED_MORTAL:
1547 /* revive dead object temporarily for DelItem */
1548 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001549 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001550 Py_FatalError(
1551 "deletion of interned string failed");
1552 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001553
Benjamin Peterson29060642009-01-31 22:14:21 +00001554 case SSTATE_INTERNED_IMMORTAL:
1555 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001556
Benjamin Peterson29060642009-01-31 22:14:21 +00001557 default:
1558 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001559 }
1560
Victor Stinner03490912011-10-03 23:45:12 +02001561 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001562 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001563 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001564 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001565 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1566 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001567
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001568 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569}
1570
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if unicode is the shared empty string or the
   cached single-character string for a code point below 256, else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string of length one with a canonical representation can be
       an entry of the unicode_latin1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return ch < 256 && unicode_latin1[ch] == unicode;
}
#endif
1587
Alexander Belopolsky40018472011-02-26 01:02:56 +00001588static int
Victor Stinner488fa492011-12-12 00:01:39 +01001589unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001590{
Victor Stinner488fa492011-12-12 00:01:39 +01001591 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592 if (Py_REFCNT(unicode) != 1)
1593 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001594 if (_PyUnicode_HASH(unicode) != -1)
1595 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001596 if (PyUnicode_CHECK_INTERNED(unicode))
1597 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001598 if (!PyUnicode_CheckExact(unicode))
1599 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001600#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001601 /* singleton refcount is greater than 1 */
1602 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001603#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001604 return 1;
1605}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001606
Victor Stinnerfe226c02011-10-03 03:52:20 +02001607static int
1608unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1609{
1610 PyObject *unicode;
1611 Py_ssize_t old_length;
1612
1613 assert(p_unicode != NULL);
1614 unicode = *p_unicode;
1615
1616 assert(unicode != NULL);
1617 assert(PyUnicode_Check(unicode));
1618 assert(0 <= length);
1619
Victor Stinner910337b2011-10-03 03:20:16 +02001620 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001621 old_length = PyUnicode_WSTR_LENGTH(unicode);
1622 else
1623 old_length = PyUnicode_GET_LENGTH(unicode);
1624 if (old_length == length)
1625 return 0;
1626
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001627 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001628 _Py_INCREF_UNICODE_EMPTY();
1629 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001630 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001631 Py_DECREF(*p_unicode);
1632 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001633 return 0;
1634 }
1635
Victor Stinner488fa492011-12-12 00:01:39 +01001636 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001637 PyObject *copy = resize_copy(unicode, length);
1638 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001639 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001640 Py_DECREF(*p_unicode);
1641 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001642 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001643 }
1644
Victor Stinnerfe226c02011-10-03 03:52:20 +02001645 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001646 PyObject *new_unicode = resize_compact(unicode, length);
1647 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001648 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001649 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001650 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001651 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001652 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001653}
1654
Alexander Belopolsky40018472011-02-26 01:02:56 +00001655int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001656PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001657{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001658 PyObject *unicode;
1659 if (p_unicode == NULL) {
1660 PyErr_BadInternalCall();
1661 return -1;
1662 }
1663 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001664 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001665 {
1666 PyErr_BadInternalCall();
1667 return -1;
1668 }
1669 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001670}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001671
Victor Stinnerc5166102012-02-22 13:55:02 +01001672/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001673
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001674 WARNING: The function doesn't copy the terminating null character and
1675 doesn't check the maximum character (may write a latin1 character in an
1676 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001677static void
1678unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1679 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001680{
1681 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1682 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001683 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001684
1685 switch (kind) {
1686 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001687 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001688#ifdef Py_DEBUG
1689 if (PyUnicode_IS_ASCII(unicode)) {
1690 Py_UCS4 maxchar = ucs1lib_find_max_char(
1691 (const Py_UCS1*)str,
1692 (const Py_UCS1*)str + len);
1693 assert(maxchar < 128);
1694 }
1695#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001696 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001697 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001698 }
1699 case PyUnicode_2BYTE_KIND: {
1700 Py_UCS2 *start = (Py_UCS2 *)data + index;
1701 Py_UCS2 *ucs2 = start;
1702 assert(index <= PyUnicode_GET_LENGTH(unicode));
1703
Victor Stinner184252a2012-06-16 02:57:41 +02001704 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001705 *ucs2 = (Py_UCS2)*str;
1706
1707 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001708 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001709 }
1710 default: {
1711 Py_UCS4 *start = (Py_UCS4 *)data + index;
1712 Py_UCS4 *ucs4 = start;
1713 assert(kind == PyUnicode_4BYTE_KIND);
1714 assert(index <= PyUnicode_GET_LENGTH(unicode));
1715
Victor Stinner184252a2012-06-16 02:57:41 +02001716 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001717 *ucs4 = (Py_UCS4)*str;
1718
1719 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001720 }
1721 }
1722}
1723
1724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725static PyObject*
1726get_latin1_char(unsigned char ch)
1727{
Victor Stinnera464fc12011-10-02 20:39:30 +02001728 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001730 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 if (!unicode)
1732 return NULL;
1733 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001734 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 unicode_latin1[ch] = unicode;
1736 }
1737 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001738 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739}
1740
Alexander Belopolsky40018472011-02-26 01:02:56 +00001741PyObject *
1742PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001743{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001744 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745 Py_UCS4 maxchar = 0;
1746 Py_ssize_t num_surrogates;
1747
1748 if (u == NULL)
1749 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001751 /* If the Unicode data is known at construction time, we can apply
1752 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001755 if (size == 0)
1756 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 /* Single character Unicode objects in the Latin-1 range are
1759 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001760 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 return get_latin1_char((unsigned char)*u);
1762
1763 /* If not empty and not single character, copy the Unicode data
1764 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001765 if (find_maxchar_surrogates(u, u + size,
1766 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767 return NULL;
1768
Victor Stinner8faf8212011-12-08 22:14:11 +01001769 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770 if (!unicode)
1771 return NULL;
1772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 switch (PyUnicode_KIND(unicode)) {
1774 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001775 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1777 break;
1778 case PyUnicode_2BYTE_KIND:
1779#if Py_UNICODE_SIZE == 2
1780 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1781#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001782 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1784#endif
1785 break;
1786 case PyUnicode_4BYTE_KIND:
1787#if SIZEOF_WCHAR_T == 2
1788 /* This is the only case which has to process surrogates, thus
1789 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001790 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791#else
1792 assert(num_surrogates == 0);
1793 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1794#endif
1795 break;
1796 default:
1797 assert(0 && "Impossible state");
1798 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001800 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801}
1802
Alexander Belopolsky40018472011-02-26 01:02:56 +00001803PyObject *
1804PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001805{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001806 if (size < 0) {
1807 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001808 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001809 return NULL;
1810 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001811 if (u != NULL)
1812 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1813 else
1814 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001815}
1816
Alexander Belopolsky40018472011-02-26 01:02:56 +00001817PyObject *
1818PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001819{
1820 size_t size = strlen(u);
1821 if (size > PY_SSIZE_T_MAX) {
1822 PyErr_SetString(PyExc_OverflowError, "input too long");
1823 return NULL;
1824 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001825 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001826}
1827
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001828PyObject *
1829_PyUnicode_FromId(_Py_Identifier *id)
1830{
1831 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001832 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1833 strlen(id->string),
1834 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001835 if (!id->object)
1836 return NULL;
1837 PyUnicode_InternInPlace(&id->object);
1838 assert(!id->next);
1839 id->next = static_strings;
1840 static_strings = id;
1841 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001842 return id->object;
1843}
1844
1845void
1846_PyUnicode_ClearStaticStrings()
1847{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001848 _Py_Identifier *tmp, *s = static_strings;
1849 while (s) {
1850 Py_DECREF(s->object);
1851 s->object = NULL;
1852 tmp = s->next;
1853 s->next = NULL;
1854 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001855 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001856 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001857}
1858
Benjamin Peterson0df54292012-03-26 14:50:32 -04001859/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001860
Victor Stinnerd3f08822012-05-29 12:57:52 +02001861PyObject*
1862_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001863{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001864 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001865 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001866 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001867#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001868 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001869#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001870 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001871 }
Victor Stinner785938e2011-12-11 20:09:03 +01001872 unicode = PyUnicode_New(size, 127);
1873 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001874 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001875 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1876 assert(_PyUnicode_CheckConsistency(unicode, 1));
1877 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001878}
1879
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001880static Py_UCS4
1881kind_maxchar_limit(unsigned int kind)
1882{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001883 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001884 case PyUnicode_1BYTE_KIND:
1885 return 0x80;
1886 case PyUnicode_2BYTE_KIND:
1887 return 0x100;
1888 case PyUnicode_4BYTE_KIND:
1889 return 0x10000;
1890 default:
1891 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001892 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001893 }
1894}
1895
Victor Stinnere6abb482012-05-02 01:15:40 +02001896Py_LOCAL_INLINE(Py_UCS4)
1897align_maxchar(Py_UCS4 maxchar)
1898{
1899 if (maxchar <= 127)
1900 return 127;
1901 else if (maxchar <= 255)
1902 return 255;
1903 else if (maxchar <= 65535)
1904 return 65535;
1905 else
1906 return MAX_UNICODE;
1907}
1908
Victor Stinner702c7342011-10-05 13:50:52 +02001909static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001910_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001913 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001914
Serhiy Storchaka678db842013-01-26 12:16:36 +02001915 if (size == 0)
1916 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001917 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001918 if (size == 1)
1919 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001920
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001921 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001922 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 if (!res)
1924 return NULL;
1925 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001926 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001927 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001928}
1929
Victor Stinnere57b1c02011-09-28 22:20:48 +02001930static PyObject*
1931_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001932{
1933 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001934 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001935
Serhiy Storchaka678db842013-01-26 12:16:36 +02001936 if (size == 0)
1937 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001938 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001939 if (size == 1) {
1940 Py_UCS4 ch = u[0];
1941 if (ch < 256)
1942 return get_latin1_char((unsigned char)ch);
1943
1944 res = PyUnicode_New(1, ch);
1945 if (res == NULL)
1946 return NULL;
1947 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1948 assert(_PyUnicode_CheckConsistency(res, 1));
1949 return res;
1950 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001951
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001952 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001953 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 if (!res)
1955 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001956 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001958 else {
1959 _PyUnicode_CONVERT_BYTES(
1960 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1961 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001962 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963 return res;
1964}
1965
Victor Stinnere57b1c02011-09-28 22:20:48 +02001966static PyObject*
1967_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968{
1969 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001970 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001971
Serhiy Storchaka678db842013-01-26 12:16:36 +02001972 if (size == 0)
1973 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001974 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001975 if (size == 1) {
1976 Py_UCS4 ch = u[0];
1977 if (ch < 256)
1978 return get_latin1_char((unsigned char)ch);
1979
1980 res = PyUnicode_New(1, ch);
1981 if (res == NULL)
1982 return NULL;
1983 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1984 assert(_PyUnicode_CheckConsistency(res, 1));
1985 return res;
1986 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001987
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001988 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001989 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990 if (!res)
1991 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001992 if (max_char < 256)
1993 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1994 PyUnicode_1BYTE_DATA(res));
1995 else if (max_char < 0x10000)
1996 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1997 PyUnicode_2BYTE_DATA(res));
1998 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002000 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 return res;
2002}
2003
2004PyObject*
2005PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2006{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002007 if (size < 0) {
2008 PyErr_SetString(PyExc_ValueError, "size must be positive");
2009 return NULL;
2010 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002011 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002013 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002015 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002017 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002018 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002019 PyErr_SetString(PyExc_SystemError, "invalid kind");
2020 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022}
2023
Victor Stinnerece58de2012-04-23 23:36:38 +02002024Py_UCS4
2025_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2026{
2027 enum PyUnicode_Kind kind;
2028 void *startptr, *endptr;
2029
2030 assert(PyUnicode_IS_READY(unicode));
2031 assert(0 <= start);
2032 assert(end <= PyUnicode_GET_LENGTH(unicode));
2033 assert(start <= end);
2034
2035 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2036 return PyUnicode_MAX_CHAR_VALUE(unicode);
2037
2038 if (start == end)
2039 return 127;
2040
Victor Stinner94d558b2012-04-27 22:26:58 +02002041 if (PyUnicode_IS_ASCII(unicode))
2042 return 127;
2043
Victor Stinnerece58de2012-04-23 23:36:38 +02002044 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002045 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002046 endptr = (char *)startptr + end * kind;
2047 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002048 switch(kind) {
2049 case PyUnicode_1BYTE_KIND:
2050 return ucs1lib_find_max_char(startptr, endptr);
2051 case PyUnicode_2BYTE_KIND:
2052 return ucs2lib_find_max_char(startptr, endptr);
2053 case PyUnicode_4BYTE_KIND:
2054 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002055 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002056 assert(0);
2057 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002058 }
2059}
2060
Victor Stinner25a4b292011-10-06 12:31:55 +02002061/* Ensure that a string uses the most efficient storage, if it is not the
2062 case: create a new string with of the right kind. Write NULL into *p_unicode
2063 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002064static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002065unicode_adjust_maxchar(PyObject **p_unicode)
2066{
2067 PyObject *unicode, *copy;
2068 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002069 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002070 unsigned int kind;
2071
2072 assert(p_unicode != NULL);
2073 unicode = *p_unicode;
2074 assert(PyUnicode_IS_READY(unicode));
2075 if (PyUnicode_IS_ASCII(unicode))
2076 return;
2077
2078 len = PyUnicode_GET_LENGTH(unicode);
2079 kind = PyUnicode_KIND(unicode);
2080 if (kind == PyUnicode_1BYTE_KIND) {
2081 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002082 max_char = ucs1lib_find_max_char(u, u + len);
2083 if (max_char >= 128)
2084 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002085 }
2086 else if (kind == PyUnicode_2BYTE_KIND) {
2087 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002088 max_char = ucs2lib_find_max_char(u, u + len);
2089 if (max_char >= 256)
2090 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002091 }
2092 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002093 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002094 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002095 max_char = ucs4lib_find_max_char(u, u + len);
2096 if (max_char >= 0x10000)
2097 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002098 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002099 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002100 if (copy != NULL)
2101 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002102 Py_DECREF(unicode);
2103 *p_unicode = copy;
2104}
2105
Victor Stinner034f6cf2011-09-30 02:26:44 +02002106PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002107_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002108{
Victor Stinner87af4f22011-11-21 23:03:47 +01002109 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002110 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002111
Victor Stinner034f6cf2011-09-30 02:26:44 +02002112 if (!PyUnicode_Check(unicode)) {
2113 PyErr_BadInternalCall();
2114 return NULL;
2115 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002116 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002117 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002118
Victor Stinner87af4f22011-11-21 23:03:47 +01002119 length = PyUnicode_GET_LENGTH(unicode);
2120 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002121 if (!copy)
2122 return NULL;
2123 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2124
Victor Stinner87af4f22011-11-21 23:03:47 +01002125 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2126 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002127 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002128 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002129}
2130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002131
Victor Stinnerbc603d12011-10-02 01:00:40 +02002132/* Widen Unicode objects to larger buffers. Don't write terminating null
2133 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002134
2135void*
2136_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2137{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002138 Py_ssize_t len;
2139 void *result;
2140 unsigned int skind;
2141
Benjamin Petersonbac79492012-01-14 13:34:47 -05002142 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002143 return NULL;
2144
2145 len = PyUnicode_GET_LENGTH(s);
2146 skind = PyUnicode_KIND(s);
2147 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002148 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002149 return NULL;
2150 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002151 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002152 case PyUnicode_2BYTE_KIND:
2153 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2154 if (!result)
2155 return PyErr_NoMemory();
2156 assert(skind == PyUnicode_1BYTE_KIND);
2157 _PyUnicode_CONVERT_BYTES(
2158 Py_UCS1, Py_UCS2,
2159 PyUnicode_1BYTE_DATA(s),
2160 PyUnicode_1BYTE_DATA(s) + len,
2161 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002163 case PyUnicode_4BYTE_KIND:
2164 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2165 if (!result)
2166 return PyErr_NoMemory();
2167 if (skind == PyUnicode_2BYTE_KIND) {
2168 _PyUnicode_CONVERT_BYTES(
2169 Py_UCS2, Py_UCS4,
2170 PyUnicode_2BYTE_DATA(s),
2171 PyUnicode_2BYTE_DATA(s) + len,
2172 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002174 else {
2175 assert(skind == PyUnicode_1BYTE_KIND);
2176 _PyUnicode_CONVERT_BYTES(
2177 Py_UCS1, Py_UCS4,
2178 PyUnicode_1BYTE_DATA(s),
2179 PyUnicode_1BYTE_DATA(s) + len,
2180 result);
2181 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002183 default:
2184 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 }
Victor Stinner01698042011-10-04 00:04:26 +02002186 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002187 return NULL;
2188}
2189
2190static Py_UCS4*
2191as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2192 int copy_null)
2193{
2194 int kind;
2195 void *data;
2196 Py_ssize_t len, targetlen;
2197 if (PyUnicode_READY(string) == -1)
2198 return NULL;
2199 kind = PyUnicode_KIND(string);
2200 data = PyUnicode_DATA(string);
2201 len = PyUnicode_GET_LENGTH(string);
2202 targetlen = len;
2203 if (copy_null)
2204 targetlen++;
2205 if (!target) {
2206 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2207 PyErr_NoMemory();
2208 return NULL;
2209 }
2210 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2211 if (!target) {
2212 PyErr_NoMemory();
2213 return NULL;
2214 }
2215 }
2216 else {
2217 if (targetsize < targetlen) {
2218 PyErr_Format(PyExc_SystemError,
2219 "string is longer than the buffer");
2220 if (copy_null && 0 < targetsize)
2221 target[0] = 0;
2222 return NULL;
2223 }
2224 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002225 if (kind == PyUnicode_1BYTE_KIND) {
2226 Py_UCS1 *start = (Py_UCS1 *) data;
2227 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002229 else if (kind == PyUnicode_2BYTE_KIND) {
2230 Py_UCS2 *start = (Py_UCS2 *) data;
2231 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2232 }
2233 else {
2234 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002236 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 if (copy_null)
2238 target[len] = 0;
2239 return target;
2240}
2241
2242Py_UCS4*
2243PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2244 int copy_null)
2245{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002246 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 PyErr_BadInternalCall();
2248 return NULL;
2249 }
2250 return as_ucs4(string, target, targetsize, copy_null);
2251}
2252
2253Py_UCS4*
2254PyUnicode_AsUCS4Copy(PyObject *string)
2255{
2256 return as_ucs4(string, NULL, 0, 1);
2257}
2258
2259#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002260
Alexander Belopolsky40018472011-02-26 01:02:56 +00002261PyObject *
2262PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002264 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002266 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002267 PyErr_BadInternalCall();
2268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269 }
2270
Martin v. Löwis790465f2008-04-05 20:41:37 +00002271 if (size == -1) {
2272 size = wcslen(w);
2273 }
2274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276}
2277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002279
Walter Dörwald346737f2007-05-31 10:44:43 +00002280static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002281makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002282 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002283{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002284 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002285 if (longflag)
2286 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002287 else if (longlongflag) {
2288 /* longlongflag should only ever be nonzero on machines with
2289 HAVE_LONG_LONG defined */
2290#ifdef HAVE_LONG_LONG
2291 char *f = PY_FORMAT_LONG_LONG;
2292 while (*f)
2293 *fmt++ = *f++;
2294#else
2295 /* we shouldn't ever get here */
2296 assert(0);
2297 *fmt++ = 'l';
2298#endif
2299 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002300 else if (size_tflag) {
2301 char *f = PY_FORMAT_SIZE_T;
2302 while (*f)
2303 *fmt++ = *f++;
2304 }
2305 *fmt++ = c;
2306 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002307}
2308
Victor Stinner15a11362012-10-06 23:48:20 +02002309/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002310 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2311 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2312#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002313
2314static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002315unicode_fromformat_arg(_PyUnicodeWriter *writer,
2316 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002317{
Victor Stinnere215d962012-10-06 23:03:36 +02002318 const char *p;
2319 Py_ssize_t len;
2320 int zeropad;
2321 int width;
2322 int precision;
2323 int longflag;
2324 int longlongflag;
2325 int size_tflag;
2326 int fill;
2327
2328 p = f;
2329 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002330 zeropad = 0;
2331 if (*f == '0') {
2332 zeropad = 1;
2333 f++;
2334 }
Victor Stinner96865452011-03-01 23:44:09 +00002335
2336 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner96865452011-03-01 23:44:09 +00002337 width = 0;
Victor Stinnere215d962012-10-06 23:03:36 +02002338 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002339 if (width > (INT_MAX - ((int)*f - '0')) / 10) {
2340 PyErr_SetString(PyExc_ValueError,
2341 "width too big");
2342 return NULL;
2343 }
Victor Stinnere215d962012-10-06 23:03:36 +02002344 width = (width*10) + (*f - '0');
2345 f++;
2346 }
Victor Stinner96865452011-03-01 23:44:09 +00002347 precision = 0;
2348 if (*f == '.') {
2349 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002350 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002351 if (precision > (INT_MAX - ((int)*f - '0')) / 10) {
2352 PyErr_SetString(PyExc_ValueError,
2353 "precision too big");
2354 return NULL;
2355 }
Victor Stinnere215d962012-10-06 23:03:36 +02002356 precision = (precision*10) + (*f - '0');
2357 f++;
2358 }
Victor Stinner96865452011-03-01 23:44:09 +00002359 if (*f == '%') {
2360 /* "%.3%s" => f points to "3" */
2361 f--;
2362 }
2363 }
2364 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002365 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002366 f--;
2367 }
Victor Stinner96865452011-03-01 23:44:09 +00002368
2369 /* Handle %ld, %lu, %lld and %llu. */
2370 longflag = 0;
2371 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002372 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002373 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002374 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002375 longflag = 1;
2376 ++f;
2377 }
2378#ifdef HAVE_LONG_LONG
2379 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002380 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002381 longlongflag = 1;
2382 f += 2;
2383 }
2384#endif
2385 }
2386 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002387 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002388 size_tflag = 1;
2389 ++f;
2390 }
Victor Stinnere215d962012-10-06 23:03:36 +02002391
2392 if (f[1] == '\0')
2393 writer->overallocate = 0;
2394
2395 switch (*f) {
2396 case 'c':
2397 {
2398 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002399 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2400 PyErr_SetString(PyExc_ValueError,
2401 "character argument not in range(0x110000)");
2402 return NULL;
2403 }
Victor Stinnere215d962012-10-06 23:03:36 +02002404 if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1)
2405 return NULL;
2406 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal);
2407 writer->pos++;
2408 break;
2409 }
2410
2411 case 'i':
2412 case 'd':
2413 case 'u':
2414 case 'x':
2415 {
2416 /* used by sprintf */
2417 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002418 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinnere215d962012-10-06 23:03:36 +02002419
2420 if (*f == 'u') {
2421 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2422
2423 if (longflag)
2424 len = sprintf(buffer, fmt,
2425 va_arg(*vargs, unsigned long));
2426#ifdef HAVE_LONG_LONG
2427 else if (longlongflag)
2428 len = sprintf(buffer, fmt,
2429 va_arg(*vargs, unsigned PY_LONG_LONG));
2430#endif
2431 else if (size_tflag)
2432 len = sprintf(buffer, fmt,
2433 va_arg(*vargs, size_t));
2434 else
2435 len = sprintf(buffer, fmt,
2436 va_arg(*vargs, unsigned int));
2437 }
2438 else if (*f == 'x') {
2439 makefmt(fmt, 0, 0, 0, 'x');
2440 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2441 }
2442 else {
2443 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2444
2445 if (longflag)
2446 len = sprintf(buffer, fmt,
2447 va_arg(*vargs, long));
2448#ifdef HAVE_LONG_LONG
2449 else if (longlongflag)
2450 len = sprintf(buffer, fmt,
2451 va_arg(*vargs, PY_LONG_LONG));
2452#endif
2453 else if (size_tflag)
2454 len = sprintf(buffer, fmt,
2455 va_arg(*vargs, Py_ssize_t));
2456 else
2457 len = sprintf(buffer, fmt,
2458 va_arg(*vargs, int));
2459 }
2460 assert(len >= 0);
2461
Victor Stinnere215d962012-10-06 23:03:36 +02002462 if (precision < len)
2463 precision = len;
2464 if (width > precision) {
2465 Py_UCS4 fillchar;
2466 fill = width - precision;
2467 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002468 if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1)
2469 return NULL;
2470 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2471 return NULL;
2472 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002473 }
Victor Stinner15a11362012-10-06 23:48:20 +02002474 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002475 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002476 if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1)
2477 return NULL;
2478 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2479 return NULL;
2480 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002481 }
Victor Stinner15a11362012-10-06 23:48:20 +02002482 if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002483 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002484 break;
2485 }
2486
2487 case 'p':
2488 {
2489 char number[MAX_LONG_LONG_CHARS];
2490
2491 len = sprintf(number, "%p", va_arg(*vargs, void*));
2492 assert(len >= 0);
2493
2494 /* %p is ill-defined: ensure leading 0x. */
2495 if (number[1] == 'X')
2496 number[1] = 'x';
2497 else if (number[1] != 'x') {
2498 memmove(number + 2, number,
2499 strlen(number) + 1);
2500 number[0] = '0';
2501 number[1] = 'x';
2502 len += 2;
2503 }
2504
2505 if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
2506 return NULL;
2507 break;
2508 }
2509
2510 case 's':
2511 {
2512 /* UTF-8 */
2513 const char *s = va_arg(*vargs, const char*);
2514 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2515 if (!str)
2516 return NULL;
2517 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2518 Py_DECREF(str);
2519 return NULL;
2520 }
2521 Py_DECREF(str);
2522 break;
2523 }
2524
2525 case 'U':
2526 {
2527 PyObject *obj = va_arg(*vargs, PyObject *);
2528 assert(obj && _PyUnicode_CHECK(obj));
2529
2530 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2531 return NULL;
2532 break;
2533 }
2534
2535 case 'V':
2536 {
2537 PyObject *obj = va_arg(*vargs, PyObject *);
2538 const char *str = va_arg(*vargs, const char *);
2539 PyObject *str_obj;
2540 assert(obj || str);
2541 if (obj) {
2542 assert(_PyUnicode_CHECK(obj));
2543 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2544 return NULL;
2545 }
2546 else {
2547 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2548 if (!str_obj)
2549 return NULL;
2550 if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
2551 Py_DECREF(str_obj);
2552 return NULL;
2553 }
2554 Py_DECREF(str_obj);
2555 }
2556 break;
2557 }
2558
2559 case 'S':
2560 {
2561 PyObject *obj = va_arg(*vargs, PyObject *);
2562 PyObject *str;
2563 assert(obj);
2564 str = PyObject_Str(obj);
2565 if (!str)
2566 return NULL;
2567 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2568 Py_DECREF(str);
2569 return NULL;
2570 }
2571 Py_DECREF(str);
2572 break;
2573 }
2574
2575 case 'R':
2576 {
2577 PyObject *obj = va_arg(*vargs, PyObject *);
2578 PyObject *repr;
2579 assert(obj);
2580 repr = PyObject_Repr(obj);
2581 if (!repr)
2582 return NULL;
2583 if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
2584 Py_DECREF(repr);
2585 return NULL;
2586 }
2587 Py_DECREF(repr);
2588 break;
2589 }
2590
2591 case 'A':
2592 {
2593 PyObject *obj = va_arg(*vargs, PyObject *);
2594 PyObject *ascii;
2595 assert(obj);
2596 ascii = PyObject_ASCII(obj);
2597 if (!ascii)
2598 return NULL;
2599 if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
2600 Py_DECREF(ascii);
2601 return NULL;
2602 }
2603 Py_DECREF(ascii);
2604 break;
2605 }
2606
2607 case '%':
2608 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1)
2609 return NULL;
2610 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
2611 writer->pos++;
2612 break;
2613
2614 default:
2615 /* if we stumble upon an unknown formatting code, copy the rest
2616 of the format string to the output string. (we cannot just
2617 skip the code, since there's no way to know what's in the
2618 argument list) */
2619 len = strlen(p);
2620 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2621 return NULL;
2622 f = p+len;
2623 return f;
2624 }
2625
2626 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002627 return f;
2628}
2629
Walter Dörwaldd2034312007-05-18 16:29:38 +00002630PyObject *
2631PyUnicode_FromFormatV(const char *format, va_list vargs)
2632{
Victor Stinnere215d962012-10-06 23:03:36 +02002633 va_list vargs2;
2634 const char *f;
2635 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002636
Victor Stinnere215d962012-10-06 23:03:36 +02002637 _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
2638
2639 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2640 Copy it to be able to pass a reference to a subfunction. */
2641 Py_VA_COPY(vargs2, vargs);
2642
2643 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002644 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002645 f = unicode_fromformat_arg(&writer, f, &vargs2);
2646 if (f == NULL)
2647 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002648 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002649 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002650 const char *p;
2651 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002652
Victor Stinnere215d962012-10-06 23:03:36 +02002653 p = f;
2654 do
2655 {
2656 if ((unsigned char)*p > 127) {
2657 PyErr_Format(PyExc_ValueError,
2658 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2659 "string, got a non-ASCII byte: 0x%02x",
2660 (unsigned char)*p);
2661 return NULL;
2662 }
2663 p++;
2664 }
2665 while (*p != '\0' && *p != '%');
2666 len = p - f;
2667
2668 if (*p == '\0')
2669 writer.overallocate = 0;
2670 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2671 goto fail;
2672 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2673 writer.pos += len;
2674
2675 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002676 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002677 }
Victor Stinnere215d962012-10-06 23:03:36 +02002678 return _PyUnicodeWriter_Finish(&writer);
2679
2680 fail:
2681 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002682 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002683}
2684
Walter Dörwaldd2034312007-05-18 16:29:38 +00002685PyObject *
2686PyUnicode_FromFormat(const char *format, ...)
2687{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002688 PyObject* ret;
2689 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002690
2691#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002692 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002693#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002695#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002696 ret = PyUnicode_FromFormatV(format, vargs);
2697 va_end(vargs);
2698 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002699}
2700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002701#ifdef HAVE_WCHAR_H
2702
Victor Stinner5593d8a2010-10-02 11:11:27 +00002703/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2704 convert a Unicode object to a wide character string.
2705
Victor Stinnerd88d9832011-09-06 02:00:05 +02002706 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002707 character) required to convert the unicode object. Ignore size argument.
2708
Victor Stinnerd88d9832011-09-06 02:00:05 +02002709 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002710 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002711 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002712static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002713unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002714 wchar_t *w,
2715 Py_ssize_t size)
2716{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002717 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002718 const wchar_t *wstr;
2719
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002720 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002721 if (wstr == NULL)
2722 return -1;
2723
Victor Stinner5593d8a2010-10-02 11:11:27 +00002724 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002725 if (size > res)
2726 size = res + 1;
2727 else
2728 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002730 return res;
2731 }
2732 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002733 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002734}
2735
2736Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002737PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002738 wchar_t *w,
2739 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740{
2741 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002742 PyErr_BadInternalCall();
2743 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002745 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002746}
2747
Victor Stinner137c34c2010-09-29 10:25:54 +00002748wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002749PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002750 Py_ssize_t *size)
2751{
2752 wchar_t* buffer;
2753 Py_ssize_t buflen;
2754
2755 if (unicode == NULL) {
2756 PyErr_BadInternalCall();
2757 return NULL;
2758 }
2759
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002760 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002761 if (buflen == -1)
2762 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002763 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002764 PyErr_NoMemory();
2765 return NULL;
2766 }
2767
Victor Stinner137c34c2010-09-29 10:25:54 +00002768 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2769 if (buffer == NULL) {
2770 PyErr_NoMemory();
2771 return NULL;
2772 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002773 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002774 if (buflen == -1) {
2775 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002776 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002777 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002778 if (size != NULL)
2779 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002780 return buffer;
2781}
2782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002783#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784
Alexander Belopolsky40018472011-02-26 01:02:56 +00002785PyObject *
2786PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002787{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002788 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002789 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002790 PyErr_SetString(PyExc_ValueError,
2791 "chr() arg not in range(0x110000)");
2792 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002793 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002794
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002795 if ((Py_UCS4)ordinal < 256)
2796 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002798 v = PyUnicode_New(1, ordinal);
2799 if (v == NULL)
2800 return NULL;
2801 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002802 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002803 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002804}
2805
Alexander Belopolsky40018472011-02-26 01:02:56 +00002806PyObject *
2807PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002809 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002810 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002811 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002812 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002813 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002814 Py_INCREF(obj);
2815 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002816 }
2817 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002818 /* For a Unicode subtype that's not a Unicode object,
2819 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002820 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002821 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002822 PyErr_Format(PyExc_TypeError,
2823 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002824 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002825 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002826}
2827
Alexander Belopolsky40018472011-02-26 01:02:56 +00002828PyObject *
2829PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002830 const char *encoding,
2831 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002832{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002833 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002834 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002835
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002837 PyErr_BadInternalCall();
2838 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002840
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002841 /* Decoding bytes objects is the most common case and should be fast */
2842 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002843 if (PyBytes_GET_SIZE(obj) == 0)
2844 _Py_RETURN_UNICODE_EMPTY();
2845 v = PyUnicode_Decode(
2846 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2847 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002848 return v;
2849 }
2850
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002851 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002852 PyErr_SetString(PyExc_TypeError,
2853 "decoding str is not supported");
2854 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002855 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002856
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002857 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2858 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2859 PyErr_Format(PyExc_TypeError,
2860 "coercing to str: need bytes, bytearray "
2861 "or buffer-like object, %.80s found",
2862 Py_TYPE(obj)->tp_name);
2863 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002864 }
Tim Petersced69f82003-09-16 20:30:58 +00002865
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002866 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002867 PyBuffer_Release(&buffer);
2868 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002870
Serhiy Storchaka05997252013-01-26 12:14:02 +02002871 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002872 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002873 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874}
2875
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result, including its terminating NUL, is written to `lower`, which
   must have room for `lower_len` bytes.

   Return 0 on error (the normalized name does not fit in lower_len-1
   characters, or lower_len is 0), 1 on success.  On error the content of
   `lower` is unspecified and may not be NUL-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* With no room for even the terminating NUL nothing can be stored,
       and &lower[lower_len - 1] below would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the terminating NUL: refuse instead of overflowing
           a buffer that is too small for the default name. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last writable slot; it is reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e);
            e++;
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2912
Alexander Belopolsky40018472011-02-26 01:02:56 +00002913PyObject *
2914PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002915 Py_ssize_t size,
2916 const char *encoding,
2917 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002918{
2919 PyObject *buffer = NULL, *unicode;
2920 Py_buffer info;
2921 char lower[11]; /* Enough for any encoding shortcut */
2922
Fred Drakee4315f52000-05-09 19:53:39 +00002923 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002924 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002925 if ((strcmp(lower, "utf-8") == 0) ||
2926 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002927 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002928 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002929 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002930 (strcmp(lower, "iso-8859-1") == 0))
2931 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002932#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002933 else if (strcmp(lower, "mbcs") == 0)
2934 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002935#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002936 else if (strcmp(lower, "ascii") == 0)
2937 return PyUnicode_DecodeASCII(s, size, errors);
2938 else if (strcmp(lower, "utf-16") == 0)
2939 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2940 else if (strcmp(lower, "utf-32") == 0)
2941 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2942 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943
2944 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002945 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002946 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002947 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002948 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002949 if (buffer == NULL)
2950 goto onError;
2951 unicode = PyCodec_Decode(buffer, encoding, errors);
2952 if (unicode == NULL)
2953 goto onError;
2954 if (!PyUnicode_Check(unicode)) {
2955 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002956 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002957 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958 Py_DECREF(unicode);
2959 goto onError;
2960 }
2961 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002962 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002963
Benjamin Peterson29060642009-01-31 22:14:21 +00002964 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965 Py_XDECREF(buffer);
2966 return NULL;
2967}
2968
Alexander Belopolsky40018472011-02-26 01:02:56 +00002969PyObject *
2970PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002971 const char *encoding,
2972 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002973{
2974 PyObject *v;
2975
2976 if (!PyUnicode_Check(unicode)) {
2977 PyErr_BadArgument();
2978 goto onError;
2979 }
2980
2981 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002982 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002983
2984 /* Decode via the codec registry */
2985 v = PyCodec_Decode(unicode, encoding, errors);
2986 if (v == NULL)
2987 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002988 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002989
Benjamin Peterson29060642009-01-31 22:14:21 +00002990 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002991 return NULL;
2992}
2993
Alexander Belopolsky40018472011-02-26 01:02:56 +00002994PyObject *
2995PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002996 const char *encoding,
2997 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002998{
2999 PyObject *v;
3000
3001 if (!PyUnicode_Check(unicode)) {
3002 PyErr_BadArgument();
3003 goto onError;
3004 }
3005
3006 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003007 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003008
3009 /* Decode via the codec registry */
3010 v = PyCodec_Decode(unicode, encoding, errors);
3011 if (v == NULL)
3012 goto onError;
3013 if (!PyUnicode_Check(v)) {
3014 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003015 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003016 Py_TYPE(v)->tp_name);
3017 Py_DECREF(v);
3018 goto onError;
3019 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003020 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003021
Benjamin Peterson29060642009-01-31 22:14:21 +00003022 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003023 return NULL;
3024}
3025
Alexander Belopolsky40018472011-02-26 01:02:56 +00003026PyObject *
3027PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003028 Py_ssize_t size,
3029 const char *encoding,
3030 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031{
3032 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003033
Guido van Rossumd57fd912000-03-10 22:53:23 +00003034 unicode = PyUnicode_FromUnicode(s, size);
3035 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003036 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3038 Py_DECREF(unicode);
3039 return v;
3040}
3041
Alexander Belopolsky40018472011-02-26 01:02:56 +00003042PyObject *
3043PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003044 const char *encoding,
3045 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003046{
3047 PyObject *v;
3048
3049 if (!PyUnicode_Check(unicode)) {
3050 PyErr_BadArgument();
3051 goto onError;
3052 }
3053
3054 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003055 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003056
3057 /* Encode via the codec registry */
3058 v = PyCodec_Encode(unicode, encoding, errors);
3059 if (v == NULL)
3060 goto onError;
3061 return v;
3062
Benjamin Peterson29060642009-01-31 22:14:21 +00003063 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003064 return NULL;
3065}
3066
/* Locate the first wide character of `wstr` that wcstombs() cannot
   convert, by converting one character at a time (a surrogate pair counts
   as one character when wchar_t is 16 bits wide).

   Returns the index, in wchar_t units, of the offending character, or 0
   if every character converts -- so 0 is ambiguous between "first
   character" and "nothing found". */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) plus the terminating NUL. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = { 0, 0, 0 };
#else
    wchar_t unit[2] = { 0, 0 };
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = cursor[0];
        cursor += 1;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3113
Victor Stinner1b579672011-12-17 05:47:23 +01003114static int
3115locale_error_handler(const char *errors, int *surrogateescape)
3116{
3117 if (errors == NULL) {
3118 *surrogateescape = 0;
3119 return 0;
3120 }
3121
3122 if (strcmp(errors, "strict") == 0) {
3123 *surrogateescape = 0;
3124 return 0;
3125 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003126 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003127 *surrogateescape = 1;
3128 return 0;
3129 }
3130 PyErr_Format(PyExc_ValueError,
3131 "only 'strict' and 'surrogateescape' error handlers "
3132 "are supported, not '%s'",
3133 errors);
3134 return -1;
3135}
3136
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003137PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003138PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003139{
3140 Py_ssize_t wlen, wlen2;
3141 wchar_t *wstr;
3142 PyObject *bytes = NULL;
3143 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003144 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003145 PyObject *exc;
3146 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003147 int surrogateescape;
3148
3149 if (locale_error_handler(errors, &surrogateescape) < 0)
3150 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003151
3152 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3153 if (wstr == NULL)
3154 return NULL;
3155
3156 wlen2 = wcslen(wstr);
3157 if (wlen2 != wlen) {
3158 PyMem_Free(wstr);
3159 PyErr_SetString(PyExc_TypeError, "embedded null character");
3160 return NULL;
3161 }
3162
3163 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003164 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003165 char *str;
3166
3167 str = _Py_wchar2char(wstr, &error_pos);
3168 if (str == NULL) {
3169 if (error_pos == (size_t)-1) {
3170 PyErr_NoMemory();
3171 PyMem_Free(wstr);
3172 return NULL;
3173 }
3174 else {
3175 goto encode_error;
3176 }
3177 }
3178 PyMem_Free(wstr);
3179
3180 bytes = PyBytes_FromString(str);
3181 PyMem_Free(str);
3182 }
3183 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003184 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003185 size_t len, len2;
3186
3187 len = wcstombs(NULL, wstr, 0);
3188 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003189 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003190 goto encode_error;
3191 }
3192
3193 bytes = PyBytes_FromStringAndSize(NULL, len);
3194 if (bytes == NULL) {
3195 PyMem_Free(wstr);
3196 return NULL;
3197 }
3198
3199 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3200 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003201 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003202 goto encode_error;
3203 }
3204 PyMem_Free(wstr);
3205 }
3206 return bytes;
3207
3208encode_error:
3209 errmsg = strerror(errno);
3210 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003211
3212 if (error_pos == (size_t)-1)
3213 error_pos = wcstombs_errorpos(wstr);
3214
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003215 PyMem_Free(wstr);
3216 Py_XDECREF(bytes);
3217
Victor Stinner2f197072011-12-17 07:08:30 +01003218 if (errmsg != NULL) {
3219 size_t errlen;
3220 wstr = _Py_char2wchar(errmsg, &errlen);
3221 if (wstr != NULL) {
3222 reason = PyUnicode_FromWideChar(wstr, errlen);
3223 PyMem_Free(wstr);
3224 } else
3225 errmsg = NULL;
3226 }
3227 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003228 reason = PyUnicode_FromString(
3229 "wcstombs() encountered an unencodable "
3230 "wide character");
3231 if (reason == NULL)
3232 return NULL;
3233
3234 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3235 "locale", unicode,
3236 (Py_ssize_t)error_pos,
3237 (Py_ssize_t)(error_pos+1),
3238 reason);
3239 Py_DECREF(reason);
3240 if (exc != NULL) {
3241 PyCodec_StrictErrors(exc);
3242 Py_XDECREF(exc);
3243 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003244 return NULL;
3245}
3246
Victor Stinnerad158722010-10-27 00:25:46 +00003247PyObject *
3248PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003249{
Victor Stinner99b95382011-07-04 14:23:54 +02003250#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003251 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003252#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003253 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003254#else
Victor Stinner793b5312011-04-27 00:24:21 +02003255 PyInterpreterState *interp = PyThreadState_GET()->interp;
3256 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3257 cannot use it to encode and decode filenames before it is loaded. Load
3258 the Python codec requires to encode at least its own filename. Use the C
3259 version of the locale codec until the codec registry is initialized and
3260 the Python codec is loaded.
3261
3262 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3263 cannot only rely on it: check also interp->fscodec_initialized for
3264 subinterpreters. */
3265 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003266 return PyUnicode_AsEncodedString(unicode,
3267 Py_FileSystemDefaultEncoding,
3268 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003269 }
3270 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003271 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003272 }
Victor Stinnerad158722010-10-27 00:25:46 +00003273#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003274}
3275
Alexander Belopolsky40018472011-02-26 01:02:56 +00003276PyObject *
3277PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003278 const char *encoding,
3279 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003280{
3281 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003282 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003283
Guido van Rossumd57fd912000-03-10 22:53:23 +00003284 if (!PyUnicode_Check(unicode)) {
3285 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003286 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 }
Fred Drakee4315f52000-05-09 19:53:39 +00003288
Fred Drakee4315f52000-05-09 19:53:39 +00003289 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003290 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003291 if ((strcmp(lower, "utf-8") == 0) ||
3292 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003293 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003294 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003295 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003296 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003297 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003298 }
Victor Stinner37296e82010-06-10 13:36:23 +00003299 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003300 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003301 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003302 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003303#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003304 else if (strcmp(lower, "mbcs") == 0)
3305 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003306#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003307 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003308 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003309 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310
3311 /* Encode via the codec registry */
3312 v = PyCodec_Encode(unicode, encoding, errors);
3313 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003314 return NULL;
3315
3316 /* The normal path */
3317 if (PyBytes_Check(v))
3318 return v;
3319
3320 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003321 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003322 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003323 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003324
3325 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3326 "encoder %s returned bytearray instead of bytes",
3327 encoding);
3328 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003329 Py_DECREF(v);
3330 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003331 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003332
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003333 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3334 Py_DECREF(v);
3335 return b;
3336 }
3337
3338 PyErr_Format(PyExc_TypeError,
3339 "encoder did not return a bytes object (type=%.400s)",
3340 Py_TYPE(v)->tp_name);
3341 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003342 return NULL;
3343}
3344
Alexander Belopolsky40018472011-02-26 01:02:56 +00003345PyObject *
3346PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003347 const char *encoding,
3348 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003349{
3350 PyObject *v;
3351
3352 if (!PyUnicode_Check(unicode)) {
3353 PyErr_BadArgument();
3354 goto onError;
3355 }
3356
3357 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003358 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003359
3360 /* Encode via the codec registry */
3361 v = PyCodec_Encode(unicode, encoding, errors);
3362 if (v == NULL)
3363 goto onError;
3364 if (!PyUnicode_Check(v)) {
3365 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003366 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003367 Py_TYPE(v)->tp_name);
3368 Py_DECREF(v);
3369 goto onError;
3370 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003372
Benjamin Peterson29060642009-01-31 22:14:21 +00003373 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003374 return NULL;
3375}
3376
/* Locate the first byte sequence in the `len` bytes at `str` that
   mbrtowc() reports as invalid or incomplete.

   Returns the byte offset of that sequence.  Returns 0 when no such
   sequence is found, and always when mbrtowc() is not available
   (HAVE_MBRTOWC undefined) -- so 0 is ambiguous between "first byte" and
   "nothing found". */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Start from the initial conversion state. */
    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        else {
            str += converted;
            len -= converted;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    /* No way to scan character by character on this platform. */
    (void)str;
    (void)len;
    return 0;
#endif
}
3407
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003408PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003409PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003410 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003411{
3412 wchar_t smallbuf[256];
3413 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3414 wchar_t *wstr;
3415 size_t wlen, wlen2;
3416 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003417 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003418 size_t error_pos;
3419 char *errmsg;
3420 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003421
3422 if (locale_error_handler(errors, &surrogateescape) < 0)
3423 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003424
3425 if (str[len] != '\0' || len != strlen(str)) {
3426 PyErr_SetString(PyExc_TypeError, "embedded null character");
3427 return NULL;
3428 }
3429
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003430 if (surrogateescape) {
3431 /* "surrogateescape" error handler */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003432 wstr = _Py_char2wchar(str, &wlen);
3433 if (wstr == NULL) {
3434 if (wlen == (size_t)-1)
3435 PyErr_NoMemory();
3436 else
3437 PyErr_SetFromErrno(PyExc_OSError);
3438 return NULL;
3439 }
3440
3441 unicode = PyUnicode_FromWideChar(wstr, wlen);
3442 PyMem_Free(wstr);
3443 }
3444 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003445 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003446#ifndef HAVE_BROKEN_MBSTOWCS
3447 wlen = mbstowcs(NULL, str, 0);
3448#else
3449 wlen = len;
3450#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003451 if (wlen == (size_t)-1)
3452 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003453 if (wlen+1 <= smallbuf_len) {
3454 wstr = smallbuf;
3455 }
3456 else {
3457 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3458 return PyErr_NoMemory();
3459
3460 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3461 if (!wstr)
3462 return PyErr_NoMemory();
3463 }
3464
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003465 wlen2 = mbstowcs(wstr, str, wlen+1);
3466 if (wlen2 == (size_t)-1) {
3467 if (wstr != smallbuf)
3468 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003469 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003470 }
3471#ifdef HAVE_BROKEN_MBSTOWCS
3472 assert(wlen2 == wlen);
3473#endif
3474 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3475 if (wstr != smallbuf)
3476 PyMem_Free(wstr);
3477 }
3478 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003479
3480decode_error:
3481 errmsg = strerror(errno);
3482 assert(errmsg != NULL);
3483
3484 error_pos = mbstowcs_errorpos(str, len);
3485 if (errmsg != NULL) {
3486 size_t errlen;
3487 wstr = _Py_char2wchar(errmsg, &errlen);
3488 if (wstr != NULL) {
3489 reason = PyUnicode_FromWideChar(wstr, errlen);
3490 PyMem_Free(wstr);
3491 } else
3492 errmsg = NULL;
3493 }
3494 if (errmsg == NULL)
3495 reason = PyUnicode_FromString(
3496 "mbstowcs() encountered an invalid multibyte sequence");
3497 if (reason == NULL)
3498 return NULL;
3499
3500 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3501 "locale", str, len,
3502 (Py_ssize_t)error_pos,
3503 (Py_ssize_t)(error_pos+1),
3504 reason);
3505 Py_DECREF(reason);
3506 if (exc != NULL) {
3507 PyCodec_StrictErrors(exc);
3508 Py_XDECREF(exc);
3509 }
3510 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003511}
3512
3513PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003514PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003515{
3516 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003517 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003518}
3519
3520
3521PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003522PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003523 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003524 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3525}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003526
Christian Heimes5894ba72007-11-04 11:43:14 +00003527PyObject*
3528PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3529{
Victor Stinner99b95382011-07-04 14:23:54 +02003530#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003531 return PyUnicode_DecodeMBCS(s, size, NULL);
3532#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003533 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003534#else
Victor Stinner793b5312011-04-27 00:24:21 +02003535 PyInterpreterState *interp = PyThreadState_GET()->interp;
3536 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3537 cannot use it to encode and decode filenames before it is loaded. Load
3538 the Python codec requires to encode at least its own filename. Use the C
3539 version of the locale codec until the codec registry is initialized and
3540 the Python codec is loaded.
3541
3542 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3543 cannot only rely on it: check also interp->fscodec_initialized for
3544 subinterpreters. */
3545 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003546 return PyUnicode_Decode(s, size,
3547 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003548 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003549 }
3550 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003551 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003552 }
Victor Stinnerad158722010-10-27 00:25:46 +00003553#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003554}
3555
Martin v. Löwis011e8422009-05-05 04:43:17 +00003556
3557int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003558_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003559{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003560 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003561
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003562 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003563 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003564 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3565 PyUnicode_GET_LENGTH(str), '\0', 1);
3566 if (pos == -1)
3567 return 0;
3568 else
3569 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003570}
3571
Antoine Pitrou13348842012-01-29 18:36:34 +01003572int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003573PyUnicode_FSConverter(PyObject* arg, void* addr)
3574{
3575 PyObject *output = NULL;
3576 Py_ssize_t size;
3577 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003578 if (arg == NULL) {
3579 Py_DECREF(*(PyObject**)addr);
3580 return 1;
3581 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003582 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003583 output = arg;
3584 Py_INCREF(output);
3585 }
3586 else {
3587 arg = PyUnicode_FromObject(arg);
3588 if (!arg)
3589 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003590 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003591 Py_DECREF(arg);
3592 if (!output)
3593 return 0;
3594 if (!PyBytes_Check(output)) {
3595 Py_DECREF(output);
3596 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3597 return 0;
3598 }
3599 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003600 size = PyBytes_GET_SIZE(output);
3601 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003602 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003603 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003604 Py_DECREF(output);
3605 return 0;
3606 }
3607 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003608 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003609}
3610
3611
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003612int
3613PyUnicode_FSDecoder(PyObject* arg, void* addr)
3614{
3615 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003616 if (arg == NULL) {
3617 Py_DECREF(*(PyObject**)addr);
3618 return 1;
3619 }
3620 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003621 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003622 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003623 output = arg;
3624 Py_INCREF(output);
3625 }
3626 else {
3627 arg = PyBytes_FromObject(arg);
3628 if (!arg)
3629 return 0;
3630 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3631 PyBytes_GET_SIZE(arg));
3632 Py_DECREF(arg);
3633 if (!output)
3634 return 0;
3635 if (!PyUnicode_Check(output)) {
3636 Py_DECREF(output);
3637 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3638 return 0;
3639 }
3640 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003641 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003642 Py_DECREF(output);
3643 return 0;
3644 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003645 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003646 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003647 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3648 Py_DECREF(output);
3649 return 0;
3650 }
3651 *(PyObject**)addr = output;
3652 return Py_CLEANUP_SUPPORTED;
3653}
3654
3655
Martin v. Löwis5b222132007-06-10 09:51:05 +00003656char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003657PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003658{
Christian Heimesf3863112007-11-22 07:46:41 +00003659 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003660
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003661 if (!PyUnicode_Check(unicode)) {
3662 PyErr_BadArgument();
3663 return NULL;
3664 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003665 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003666 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003667
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003668 if (PyUnicode_UTF8(unicode) == NULL) {
3669 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003670 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3671 if (bytes == NULL)
3672 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003673 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3674 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003675 Py_DECREF(bytes);
3676 return NULL;
3677 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003678 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3679 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3680 PyBytes_AS_STRING(bytes),
3681 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003682 Py_DECREF(bytes);
3683 }
3684
3685 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003686 *psize = PyUnicode_UTF8_LENGTH(unicode);
3687 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003688}
3689
3690char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003691PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003692{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003693 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3694}
3695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003696Py_UNICODE *
3697PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3698{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003699 const unsigned char *one_byte;
3700#if SIZEOF_WCHAR_T == 4
3701 const Py_UCS2 *two_bytes;
3702#else
3703 const Py_UCS4 *four_bytes;
3704 const Py_UCS4 *ucs4_end;
3705 Py_ssize_t num_surrogates;
3706#endif
3707 wchar_t *w;
3708 wchar_t *wchar_end;
3709
3710 if (!PyUnicode_Check(unicode)) {
3711 PyErr_BadArgument();
3712 return NULL;
3713 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003714 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003715 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003716 assert(_PyUnicode_KIND(unicode) != 0);
3717 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003718
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003719 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003720#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003721 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3722 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003723 num_surrogates = 0;
3724
3725 for (; four_bytes < ucs4_end; ++four_bytes) {
3726 if (*four_bytes > 0xFFFF)
3727 ++num_surrogates;
3728 }
3729
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003730 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3731 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3732 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003733 PyErr_NoMemory();
3734 return NULL;
3735 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003736 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003737
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003738 w = _PyUnicode_WSTR(unicode);
3739 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3740 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003741 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3742 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003743 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003744 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003745 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3746 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003747 }
3748 else
3749 *w = *four_bytes;
3750
3751 if (w > wchar_end) {
3752 assert(0 && "Miscalculated string end");
3753 }
3754 }
3755 *w = 0;
3756#else
3757 /* sizeof(wchar_t) == 4 */
3758 Py_FatalError("Impossible unicode object state, wstr and str "
3759 "should share memory already.");
3760 return NULL;
3761#endif
3762 }
3763 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003764 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3765 (_PyUnicode_LENGTH(unicode) + 1));
3766 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003767 PyErr_NoMemory();
3768 return NULL;
3769 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003770 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3771 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3772 w = _PyUnicode_WSTR(unicode);
3773 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003774
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003775 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3776 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003777 for (; w < wchar_end; ++one_byte, ++w)
3778 *w = *one_byte;
3779 /* null-terminate the wstr */
3780 *w = 0;
3781 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003782 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003783#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003784 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003785 for (; w < wchar_end; ++two_bytes, ++w)
3786 *w = *two_bytes;
3787 /* null-terminate the wstr */
3788 *w = 0;
3789#else
3790 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003791 PyObject_FREE(_PyUnicode_WSTR(unicode));
3792 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003793 Py_FatalError("Impossible unicode object state, wstr "
3794 "and str should share memory already.");
3795 return NULL;
3796#endif
3797 }
3798 else {
3799 assert(0 && "This should never happen.");
3800 }
3801 }
3802 }
3803 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003804 *size = PyUnicode_WSTR_LENGTH(unicode);
3805 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003806}
3807
Alexander Belopolsky40018472011-02-26 01:02:56 +00003808Py_UNICODE *
3809PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003812}
3813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814
Alexander Belopolsky40018472011-02-26 01:02:56 +00003815Py_ssize_t
3816PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817{
3818 if (!PyUnicode_Check(unicode)) {
3819 PyErr_BadArgument();
3820 goto onError;
3821 }
3822 return PyUnicode_GET_SIZE(unicode);
3823
Benjamin Peterson29060642009-01-31 22:14:21 +00003824 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003825 return -1;
3826}
3827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003828Py_ssize_t
3829PyUnicode_GetLength(PyObject *unicode)
3830{
Victor Stinner07621332012-06-16 04:53:46 +02003831 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003832 PyErr_BadArgument();
3833 return -1;
3834 }
Victor Stinner07621332012-06-16 04:53:46 +02003835 if (PyUnicode_READY(unicode) == -1)
3836 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003837 return PyUnicode_GET_LENGTH(unicode);
3838}
3839
3840Py_UCS4
3841PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3842{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003843 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3844 PyErr_BadArgument();
3845 return (Py_UCS4)-1;
3846 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003847 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003848 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003849 return (Py_UCS4)-1;
3850 }
3851 return PyUnicode_READ_CHAR(unicode, index);
3852}
3853
3854int
3855PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3856{
3857 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003858 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003859 return -1;
3860 }
Victor Stinner488fa492011-12-12 00:01:39 +01003861 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003862 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003863 PyErr_SetString(PyExc_IndexError, "string index out of range");
3864 return -1;
3865 }
Victor Stinner488fa492011-12-12 00:01:39 +01003866 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003867 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003868 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3869 PyErr_SetString(PyExc_ValueError, "character out of range");
3870 return -1;
3871 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003872 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3873 index, ch);
3874 return 0;
3875}
3876
/* Return the name of the default encoding, which is always "utf-8".
   The string has static storage; the caller must not modify or free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3882
Victor Stinner554f3f02010-06-16 23:33:54 +00003883/* create or adjust a UnicodeDecodeError */
3884static void
3885make_decode_exception(PyObject **exceptionObject,
3886 const char *encoding,
3887 const char *input, Py_ssize_t length,
3888 Py_ssize_t startpos, Py_ssize_t endpos,
3889 const char *reason)
3890{
3891 if (*exceptionObject == NULL) {
3892 *exceptionObject = PyUnicodeDecodeError_Create(
3893 encoding, input, length, startpos, endpos, reason);
3894 }
3895 else {
3896 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3897 goto onError;
3898 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3899 goto onError;
3900 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3901 goto onError;
3902 }
3903 return;
3904
3905onError:
3906 Py_DECREF(*exceptionObject);
3907 *exceptionObject = NULL;
3908}
3909
#ifdef HAVE_MBCS
/* Error handling callback helper for decoders writing into a wchar_t
   based output object:
   build the exception, call the error handler callback and check its
   result; if no exception occurred, copy the replacement to the output
   and adjust the various state variables.

   errors / errorHandler: error handler name, and a cache for the callable
       looked up from it (filled in on first use).
   encoding / reason: used to create or update *exceptionObject.
   input / inend: current input buffer; reloaded from the exception's
       object attribute after the callback, which may have replaced it.
   startinpos / endinpos: bounds of the undecodable input; *endinpos is
       set to the resume position returned by the callback.
   inptr: set to the input pointer at which decoding should resume.
   output / outpos: output object (resized if needed) and write position,
       advanced by the length of the replacement.

   Return 0 on success, -1 on error (with an exception set).
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The text after "O!n;" doubles as the TypeError message below. */
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        /* Fail here: the PyBytes_* macros below must not be applied to an
           object that is not bytes. */
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string).
       The sum is built step by step so that it cannot overflow. */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen elements: a string copy would stop at an
       embedded NUL in the replacement and lose what follows it. */
    Py_MEMCPY(_PyUnicode_WSTR(*output) + *outpos, repwstr,
              repwlen * sizeof(wchar_t));
    *outpos += repwlen;

    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4016
4017static int
4018unicode_decode_call_errorhandler_writer(
4019 const char *errors, PyObject **errorHandler,
4020 const char *encoding, const char *reason,
4021 const char **input, const char **inend, Py_ssize_t *startinpos,
4022 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4023 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4024{
4025 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4026
4027 PyObject *restuple = NULL;
4028 PyObject *repunicode = NULL;
4029 Py_ssize_t insize;
4030 Py_ssize_t newpos;
4031 PyObject *inputobj = NULL;
4032
4033 if (*errorHandler == NULL) {
4034 *errorHandler = PyCodec_LookupError(errors);
4035 if (*errorHandler == NULL)
4036 goto onError;
4037 }
4038
4039 make_decode_exception(exceptionObject,
4040 encoding,
4041 *input, *inend - *input,
4042 *startinpos, *endinpos,
4043 reason);
4044 if (*exceptionObject == NULL)
4045 goto onError;
4046
4047 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4048 if (restuple == NULL)
4049 goto onError;
4050 if (!PyTuple_Check(restuple)) {
4051 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4052 goto onError;
4053 }
4054 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004055 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004056
4057 /* Copy back the bytes variables, which might have been modified by the
4058 callback */
4059 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4060 if (!inputobj)
4061 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004062 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004063 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004064 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004065 *input = PyBytes_AS_STRING(inputobj);
4066 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004067 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004068 /* we can DECREF safely, as the exception has another reference,
4069 so the object won't go away. */
4070 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004071
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004072 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004073 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004074 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004075 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4076 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004077 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004078
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004079 writer->overallocate = 1;
4080 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4081 return
4082
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004084 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004085
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004086 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004087 Py_XDECREF(restuple);
4088 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089
Benjamin Peterson29060642009-01-31 22:14:21 +00004090 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004091 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004092 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004093}
4094
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004095/* --- UTF-7 Codec -------------------------------------------------------- */
4096
Antoine Pitrou244651a2009-05-04 18:56:13 +00004097/* See RFC2152 for details. We encode conservatively and decode liberally. */
4098
/* Helper macros for the base-64 alphabet used inside UTF-7 shift
 * sequences.  Each of them may evaluate its argument more than once, so
 * the argument must not have side effects. */

/* True if c is one of the 64 base-64 characters: A-Z, a-z, 0-9, '+', '/'. */

#define IS_BASE64(c)                      \
    (((c) >= 'A' && (c) <= 'Z') ||        \
     ((c) >= 'a' && (c) <= 'z') ||        \
     ((c) >= '0' && (c) <= '9') ||        \
     (c) == '+' || (c) == '/')

/* Value (0..63) of the base-64 character c.  Only meaningful when
 * IS_BASE64(c) holds; any other input yields 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* Base-64 character encoding the low 6 bits of n; higher bits are
 * ignored. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if byte c, met outside a shift sequence in UTF-7
 * input, stands for itself.  Decoding is permissive: every ASCII byte
 * qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                  \
    ((c) <= 127 && (c) != '+')
4129
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code and is only ever read, hence
 * const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times and must not have side effects.  NUL and
 * anything at or above 128 is never direct.  directO and directWS are
 * parenthesized so that expression arguments (e.g. "a || b") keep their
 * meaning inside the macro. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004175
Alexander Belopolsky40018472011-02-26 01:02:56 +00004176PyObject *
4177PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004178 Py_ssize_t size,
4179 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004180{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004181 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4182}
4183
Antoine Pitrou244651a2009-05-04 18:56:13 +00004184/* The decoder. The only state we preserve is our read position,
4185 * i.e. how many characters we have consumed. So if we end in the
4186 * middle of a shift sequence we have to back off the read position
4187 * and the output to the beginning of the sequence, otherwise we lose
4188 * all the shift state (seen bits, number of bits seen, high
4189 * surrogate). */
4190
Alexander Belopolsky40018472011-02-26 01:02:56 +00004191PyObject *
4192PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004193 Py_ssize_t size,
4194 const char *errors,
4195 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004196{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004198 Py_ssize_t startinpos;
4199 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004200 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004201 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004202 const char *errmsg = "";
4203 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004204 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004205 unsigned int base64bits = 0;
4206 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004207 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208 PyObject *errorHandler = NULL;
4209 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004210
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004211 if (size == 0) {
4212 if (consumed)
4213 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004214 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004215 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004216
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004217 /* Start off assuming it's all ASCII. Widen later as necessary. */
4218 _PyUnicodeWriter_Init(&writer, 0);
4219 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4220 goto onError;
4221
4222 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004223 e = s + size;
4224
4225 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004226 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004227 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004228 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004229
Antoine Pitrou244651a2009-05-04 18:56:13 +00004230 if (inShift) { /* in a base-64 section */
4231 if (IS_BASE64(ch)) { /* consume a base-64 character */
4232 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4233 base64bits += 6;
4234 s++;
4235 if (base64bits >= 16) {
4236 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004237 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004238 base64bits -= 16;
4239 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4240 if (surrogate) {
4241 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004242 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4243 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004244 if (_PyUnicodeWriter_Prepare(&writer, 1, ch2) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004245 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004246 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch2);
4247 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004248 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004249 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004250 }
4251 else {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004252 if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004253 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004254 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
4255 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004256 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004257 }
4258 }
Victor Stinner551ac952011-11-29 22:58:13 +01004259 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004260 /* first surrogate */
4261 surrogate = outCh;
4262 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004263 else {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004264 if (_PyUnicodeWriter_Prepare(&writer, 1, outCh) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004265 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004266 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, outCh);
4267 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004268 }
4269 }
4270 }
4271 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004272 inShift = 0;
4273 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004274 if (surrogate) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004275 if (_PyUnicodeWriter_Prepare(&writer, 1, surrogate) == -1)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004276 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004277 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, surrogate);
4278 writer.pos++;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004279 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004280 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004281 if (base64bits > 0) { /* left-over bits */
4282 if (base64bits >= 6) {
4283 /* We've seen at least one base-64 character */
4284 errmsg = "partial character in shift sequence";
4285 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004286 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004287 else {
4288 /* Some bits remain; they should be zero */
4289 if (base64buffer != 0) {
4290 errmsg = "non-zero padding bits in shift sequence";
4291 goto utf7Error;
4292 }
4293 }
4294 }
4295 if (ch != '-') {
4296 /* '-' is absorbed; other terminating
4297 characters are preserved */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004298 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004299 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004300 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4301 writer.pos++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004302 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004303 }
4304 }
4305 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004306 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004307 s++; /* consume '+' */
4308 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004309 s++;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004310 if (_PyUnicodeWriter_Prepare(&writer, 1, '+') == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004311 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004312 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '+');
4313 writer.pos++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004314 }
4315 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004316 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004317 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004318 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004319 }
4320 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004321 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004322 s++;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004323 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
4324 goto onError;
4325 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4326 writer.pos++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004327 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004328 else {
4329 startinpos = s-starts;
4330 s++;
4331 errmsg = "unexpected special character";
4332 goto utf7Error;
4333 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004334 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004335utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004336 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004337 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004338 errors, &errorHandler,
4339 "utf7", errmsg,
4340 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004341 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004342 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004343 }
4344
Antoine Pitrou244651a2009-05-04 18:56:13 +00004345 /* end of string */
4346
4347 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4348 /* if we're in an inconsistent state, that's an error */
4349 if (surrogate ||
4350 (base64bits >= 6) ||
4351 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004353 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004354 errors, &errorHandler,
4355 "utf7", "unterminated shift sequence",
4356 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004357 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004358 goto onError;
4359 if (s < e)
4360 goto restart;
4361 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004362 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004363
4364 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004365 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004366 if (inShift) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004367 writer.pos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004368 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004369 }
4370 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004371 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004372 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004373 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004375 Py_XDECREF(errorHandler);
4376 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004377 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004378
Benjamin Peterson29060642009-01-31 22:14:21 +00004379 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004380 Py_XDECREF(errorHandler);
4381 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004382 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004383 return NULL;
4384}
4385
4386
Alexander Belopolsky40018472011-02-26 01:02:56 +00004387PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004388_PyUnicode_EncodeUTF7(PyObject *str,
4389 int base64SetO,
4390 int base64WhiteSpace,
4391 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004392{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004393 int kind;
4394 void *data;
4395 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004396 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004397 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004398 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004399 unsigned int base64bits = 0;
4400 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401 char * out;
4402 char * start;
4403
Benjamin Petersonbac79492012-01-14 13:34:47 -05004404 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004405 return NULL;
4406 kind = PyUnicode_KIND(str);
4407 data = PyUnicode_DATA(str);
4408 len = PyUnicode_GET_LENGTH(str);
4409
4410 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004411 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004412
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004413 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004414 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004415 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004416 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004417 if (v == NULL)
4418 return NULL;
4419
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004420 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004421 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004422 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004423
Antoine Pitrou244651a2009-05-04 18:56:13 +00004424 if (inShift) {
4425 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4426 /* shifting out */
4427 if (base64bits) { /* output remaining bits */
4428 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4429 base64buffer = 0;
4430 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004431 }
4432 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433 /* Characters not in the BASE64 set implicitly unshift the sequence
4434 so no '-' is required, except if the character is itself a '-' */
4435 if (IS_BASE64(ch) || ch == '-') {
4436 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004437 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004438 *out++ = (char) ch;
4439 }
4440 else {
4441 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004442 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004443 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004444 else { /* not in a shift sequence */
4445 if (ch == '+') {
4446 *out++ = '+';
4447 *out++ = '-';
4448 }
4449 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4450 *out++ = (char) ch;
4451 }
4452 else {
4453 *out++ = '+';
4454 inShift = 1;
4455 goto encode_char;
4456 }
4457 }
4458 continue;
4459encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004461 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004462
Antoine Pitrou244651a2009-05-04 18:56:13 +00004463 /* code first surrogate */
4464 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004465 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004466 while (base64bits >= 6) {
4467 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4468 base64bits -= 6;
4469 }
4470 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004471 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004472 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004473 base64bits += 16;
4474 base64buffer = (base64buffer << 16) | ch;
4475 while (base64bits >= 6) {
4476 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4477 base64bits -= 6;
4478 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004479 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004480 if (base64bits)
4481 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4482 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004483 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004484 if (_PyBytes_Resize(&v, out - start) < 0)
4485 return NULL;
4486 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004487}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004488PyObject *
4489PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4490 Py_ssize_t size,
4491 int base64SetO,
4492 int base64WhiteSpace,
4493 const char *errors)
4494{
4495 PyObject *result;
4496 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4497 if (tmp == NULL)
4498 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004499 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004500 base64WhiteSpace, errors);
4501 Py_DECREF(tmp);
4502 return result;
4503}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004504
Antoine Pitrou244651a2009-05-04 18:56:13 +00004505#undef IS_BASE64
4506#undef FROM_BASE64
4507#undef TO_BASE64
4508#undef DECODE_DIRECT
4509#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004510
Guido van Rossumd57fd912000-03-10 22:53:23 +00004511/* --- UTF-8 Codec -------------------------------------------------------- */
4512
Alexander Belopolsky40018472011-02-26 01:02:56 +00004513PyObject *
4514PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004515 Py_ssize_t size,
4516 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004517{
Walter Dörwald69652032004-09-07 20:24:22 +00004518 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4519}
4520
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004521#include "stringlib/asciilib.h"
4522#include "stringlib/codecs.h"
4523#include "stringlib/undef.h"
4524
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004525#include "stringlib/ucs1lib.h"
4526#include "stringlib/codecs.h"
4527#include "stringlib/undef.h"
4528
4529#include "stringlib/ucs2lib.h"
4530#include "stringlib/codecs.h"
4531#include "stringlib/undef.h"
4532
4533#include "stringlib/ucs4lib.h"
4534#include "stringlib/codecs.h"
4535#include "stringlib/undef.h"
4536
Antoine Pitrouab868312009-01-10 15:40:25 +00004537/* Mask to quickly check whether a C 'long' contains a
4538 non-ASCII, UTF8-encoded char. */
4539#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004540# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004541#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004542# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004543#else
4544# error C 'long' size should be either 4 or 8!
4545#endif
4546
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004547static Py_ssize_t
4548ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004549{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004550 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004551 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004552
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004553#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004554 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4555 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004556 /* Fast path, see in STRINGLIB(utf8_decode) for
4557 an explanation. */
4558 /* Help register allocation */
4559 register const char *_p = p;
4560 register Py_UCS1 * q = dest;
4561 while (_p < aligned_end) {
4562 unsigned long value = *(const unsigned long *) _p;
4563 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004564 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004565 *((unsigned long *)q) = value;
4566 _p += SIZEOF_LONG;
4567 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004568 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004569 p = _p;
4570 while (p < end) {
4571 if ((unsigned char)*p & 0x80)
4572 break;
4573 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004575 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004576 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004577#endif
4578 while (p < end) {
4579 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4580 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004581 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004582 /* Help register allocation */
4583 register const char *_p = p;
4584 while (_p < aligned_end) {
4585 unsigned long value = *(unsigned long *) _p;
4586 if (value & ASCII_CHAR_MASK)
4587 break;
4588 _p += SIZEOF_LONG;
4589 }
4590 p = _p;
4591 if (_p == end)
4592 break;
4593 }
4594 if ((unsigned char)*p & 0x80)
4595 break;
4596 ++p;
4597 }
4598 memcpy(dest, start, p - start);
4599 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600}
Antoine Pitrouab868312009-01-10 15:40:25 +00004601
Victor Stinner785938e2011-12-11 20:09:03 +01004602PyObject *
4603PyUnicode_DecodeUTF8Stateful(const char *s,
4604 Py_ssize_t size,
4605 const char *errors,
4606 Py_ssize_t *consumed)
4607{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004608 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004609 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004610 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004611
4612 Py_ssize_t startinpos;
4613 Py_ssize_t endinpos;
4614 const char *errmsg = "";
4615 PyObject *errorHandler = NULL;
4616 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004617
4618 if (size == 0) {
4619 if (consumed)
4620 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004621 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004622 }
4623
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004624 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4625 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004626 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004627 *consumed = 1;
4628 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004629 }
4630
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004631 _PyUnicodeWriter_Init(&writer, 0);
4632 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
4633 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004634
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004635 writer.pos = ascii_decode(s, end, writer.data);
4636 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004637 while (s < end) {
4638 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004639 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004640 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004641 if (PyUnicode_IS_ASCII(writer.buffer))
4642 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004643 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004644 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004645 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004646 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004647 } else {
4648 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004649 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004650 }
4651
4652 switch (ch) {
4653 case 0:
4654 if (s == end || consumed)
4655 goto End;
4656 errmsg = "unexpected end of data";
4657 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004658 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004659 break;
4660 case 1:
4661 errmsg = "invalid start byte";
4662 startinpos = s - starts;
4663 endinpos = startinpos + 1;
4664 break;
4665 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004666 case 3:
4667 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004668 errmsg = "invalid continuation byte";
4669 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004670 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004671 break;
4672 default:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004673 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004674 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004675 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4676 writer.pos++;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004677 continue;
4678 }
4679
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004680 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004681 errors, &errorHandler,
4682 "utf-8", errmsg,
4683 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004684 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004685 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004686 }
4687
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004688End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004689 if (consumed)
4690 *consumed = s - starts;
4691
4692 Py_XDECREF(errorHandler);
4693 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004694 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004695
4696onError:
4697 Py_XDECREF(errorHandler);
4698 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004699 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004700 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004701}
4702
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   A byte that cannot be decoded is stored as the code unit
   0xDC00 + byte value.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory), or NULL on memory allocation error. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* NOTE(review): the 4-byte decoder is expected to store every
               character itself and only hand back status codes. */
            assert(0);
#else
            /* The 2-byte decoder hands back a character it could not store
               in a single unit.  That is a code point above the BMP, which
               is split into a surrogate pair below; it is not itself a
               surrogate, so assert the code point range instead. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* ch == 0 with all input used up means decoding is complete */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004759/* Primary internal function which creates utf8 encoded bytes objects.
4760
4761 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004762 and allocate exactly as much space needed at the end. Else allocate the
4763 maximum possible needed (4 result bytes per Unicode character), and return
4764 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004765*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004766PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004767_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768{
Victor Stinner6099a032011-12-18 14:22:26 +01004769 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004770 void *data;
4771 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004773 if (!PyUnicode_Check(unicode)) {
4774 PyErr_BadArgument();
4775 return NULL;
4776 }
4777
4778 if (PyUnicode_READY(unicode) == -1)
4779 return NULL;
4780
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004781 if (PyUnicode_UTF8(unicode))
4782 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4783 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004784
4785 kind = PyUnicode_KIND(unicode);
4786 data = PyUnicode_DATA(unicode);
4787 size = PyUnicode_GET_LENGTH(unicode);
4788
Benjamin Petersonead6b532011-12-20 17:23:42 -06004789 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004790 default:
4791 assert(0);
4792 case PyUnicode_1BYTE_KIND:
4793 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4794 assert(!PyUnicode_IS_ASCII(unicode));
4795 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4796 case PyUnicode_2BYTE_KIND:
4797 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4798 case PyUnicode_4BYTE_KIND:
4799 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004800 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801}
4802
Alexander Belopolsky40018472011-02-26 01:02:56 +00004803PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004804PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4805 Py_ssize_t size,
4806 const char *errors)
4807{
4808 PyObject *v, *unicode;
4809
4810 unicode = PyUnicode_FromUnicode(s, size);
4811 if (unicode == NULL)
4812 return NULL;
4813 v = _PyUnicode_AsUTF8String(unicode, errors);
4814 Py_DECREF(unicode);
4815 return v;
4816}
4817
4818PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004819PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004821 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822}
4823
Walter Dörwald41980ca2007-08-16 21:55:45 +00004824/* --- UTF-32 Codec ------------------------------------------------------- */
4825
4826PyObject *
4827PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004828 Py_ssize_t size,
4829 const char *errors,
4830 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004831{
4832 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4833}
4834
4835PyObject *
4836PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004837 Py_ssize_t size,
4838 const char *errors,
4839 int *byteorder,
4840 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004841{
4842 const char *starts = s;
4843 Py_ssize_t startinpos;
4844 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004845 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004846 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004847 int le, bo = 0; /* assume native ordering by default */
Walter Dörwald41980ca2007-08-16 21:55:45 +00004848 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004849 PyObject *errorHandler = NULL;
4850 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004851
Walter Dörwald41980ca2007-08-16 21:55:45 +00004852 q = (unsigned char *)s;
4853 e = q + size;
4854
4855 if (byteorder)
4856 bo = *byteorder;
4857
4858 /* Check for BOM marks (U+FEFF) in the input and adjust current
4859 byte order setting accordingly. In native mode, the leading BOM
4860 mark is skipped, in all other modes, it is copied to the output
4861 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004862 if (bo == 0 && size >= 4) {
4863 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4864 if (bom == 0x0000FEFF) {
4865 bo = -1;
4866 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004867 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004868 else if (bom == 0xFFFE0000) {
4869 bo = 1;
4870 q += 4;
4871 }
4872 if (byteorder)
4873 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004874 }
4875
Victor Stinnere64322e2012-10-30 23:12:47 +01004876 if (q == e) {
4877 if (consumed)
4878 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004879 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004880 }
4881
Victor Stinnere64322e2012-10-30 23:12:47 +01004882#ifdef WORDS_BIGENDIAN
4883 le = bo < 0;
4884#else
4885 le = bo <= 0;
4886#endif
4887
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004888 _PyUnicodeWriter_Init(&writer, 0);
4889 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1)
4890 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004891
Victor Stinnere64322e2012-10-30 23:12:47 +01004892 while (1) {
4893 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004894 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004895
Victor Stinnere64322e2012-10-30 23:12:47 +01004896 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004897 enum PyUnicode_Kind kind = writer.kind;
4898 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004899 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004900 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004901 if (le) {
4902 do {
4903 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4904 if (ch > maxch)
4905 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004906 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004907 q += 4;
4908 } while (q <= last);
4909 }
4910 else {
4911 do {
4912 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4913 if (ch > maxch)
4914 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004915 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004916 q += 4;
4917 } while (q <= last);
4918 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004919 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004920 }
4921
4922 if (ch <= maxch) {
4923 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004924 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01004925 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00004926 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01004927 startinpos = ((const char *)q) - starts;
4928 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004930 else {
4931 if (ch < 0x110000) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004932 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Victor Stinnere64322e2012-10-30 23:12:47 +01004933 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004934 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
4935 writer.pos++;
Victor Stinnere64322e2012-10-30 23:12:47 +01004936 q += 4;
4937 continue;
4938 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004939 errmsg = "codepoint not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01004940 startinpos = ((const char *)q) - starts;
4941 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004942 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004943
4944 /* The remaining input chars are ignored if the callback
4945 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004946 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004947 errors, &errorHandler,
4948 "utf32", errmsg,
4949 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004950 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004951 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004952 }
4953
Walter Dörwald41980ca2007-08-16 21:55:45 +00004954 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004956
Walter Dörwald41980ca2007-08-16 21:55:45 +00004957 Py_XDECREF(errorHandler);
4958 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004959 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004960
Benjamin Peterson29060642009-01-31 22:14:21 +00004961 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004962 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004963 Py_XDECREF(errorHandler);
4964 Py_XDECREF(exc);
4965 return NULL;
4966}
4967
4968PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004969_PyUnicode_EncodeUTF32(PyObject *str,
4970 const char *errors,
4971 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004972{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004973 int kind;
4974 void *data;
4975 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004976 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004977 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004978 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004979 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02004980#if PY_LITTLE_ENDIAN
Walter Dörwald41980ca2007-08-16 21:55:45 +00004981 int iorder[] = {0, 1, 2, 3};
4982#else
4983 int iorder[] = {3, 2, 1, 0};
4984#endif
4985
Benjamin Peterson29060642009-01-31 22:14:21 +00004986#define STORECHAR(CH) \
4987 do { \
4988 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4989 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4990 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4991 p[iorder[0]] = (CH) & 0xff; \
4992 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004993 } while(0)
4994
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004995 if (!PyUnicode_Check(str)) {
4996 PyErr_BadArgument();
4997 return NULL;
4998 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004999 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005000 return NULL;
5001 kind = PyUnicode_KIND(str);
5002 data = PyUnicode_DATA(str);
5003 len = PyUnicode_GET_LENGTH(str);
5004
5005 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005006 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005007 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005008 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005009 if (v == NULL)
5010 return NULL;
5011
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005012 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005013 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005014 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005015 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005016 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005017
5018 if (byteorder == -1) {
5019 /* force LE */
5020 iorder[0] = 0;
5021 iorder[1] = 1;
5022 iorder[2] = 2;
5023 iorder[3] = 3;
5024 }
5025 else if (byteorder == 1) {
5026 /* force BE */
5027 iorder[0] = 3;
5028 iorder[1] = 2;
5029 iorder[2] = 1;
5030 iorder[3] = 0;
5031 }
5032
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005033 for (i = 0; i < len; i++)
5034 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005035
5036 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005037 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005038#undef STORECHAR
5039}
5040
Alexander Belopolsky40018472011-02-26 01:02:56 +00005041PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005042PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5043 Py_ssize_t size,
5044 const char *errors,
5045 int byteorder)
5046{
5047 PyObject *result;
5048 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5049 if (tmp == NULL)
5050 return NULL;
5051 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5052 Py_DECREF(tmp);
5053 return result;
5054}
5055
5056PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005057PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005058{
Victor Stinnerb960b342011-11-20 19:12:52 +01005059 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005060}
5061
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062/* --- UTF-16 Codec ------------------------------------------------------- */
5063
Tim Peters772747b2001-08-09 22:21:55 +00005064PyObject *
5065PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 Py_ssize_t size,
5067 const char *errors,
5068 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069{
Walter Dörwald69652032004-09-07 20:24:22 +00005070 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5071}
5072
5073PyObject *
5074PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005075 Py_ssize_t size,
5076 const char *errors,
5077 int *byteorder,
5078 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005079{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005080 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005081 Py_ssize_t startinpos;
5082 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005083 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005084 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005085 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005086 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005087 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005088 PyObject *errorHandler = NULL;
5089 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090
Tim Peters772747b2001-08-09 22:21:55 +00005091 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005092 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093
5094 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005095 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005097 /* Check for BOM marks (U+FEFF) in the input and adjust current
5098 byte order setting accordingly. In native mode, the leading BOM
5099 mark is skipped, in all other modes, it is copied to the output
5100 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005101 if (bo == 0 && size >= 2) {
5102 const Py_UCS4 bom = (q[1] << 8) | q[0];
5103 if (bom == 0xFEFF) {
5104 q += 2;
5105 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005106 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005107 else if (bom == 0xFFFE) {
5108 q += 2;
5109 bo = 1;
5110 }
5111 if (byteorder)
5112 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005113 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114
Antoine Pitrou63065d72012-05-15 23:48:04 +02005115 if (q == e) {
5116 if (consumed)
5117 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005118 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005119 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005120
Christian Heimes743e0cd2012-10-17 23:52:17 +02005121#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005122 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005123#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005124 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005125#endif
Tim Peters772747b2001-08-09 22:21:55 +00005126
Antoine Pitrou63065d72012-05-15 23:48:04 +02005127 /* Note: size will always be longer than the resulting Unicode
5128 character count */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005129 _PyUnicodeWriter_Init(&writer, 0);
5130 if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1)
5131 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005132
Antoine Pitrou63065d72012-05-15 23:48:04 +02005133 while (1) {
5134 Py_UCS4 ch = 0;
5135 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005136 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005137 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005138 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005139 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005140 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005141 native_ordering);
5142 else
5143 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005144 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005145 native_ordering);
5146 } else if (kind == PyUnicode_2BYTE_KIND) {
5147 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005148 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005149 native_ordering);
5150 } else {
5151 assert(kind == PyUnicode_4BYTE_KIND);
5152 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005153 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005154 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005155 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005156 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005157
Antoine Pitrou63065d72012-05-15 23:48:04 +02005158 switch (ch)
5159 {
5160 case 0:
5161 /* remaining byte at the end? (size should be even) */
5162 if (q == e || consumed)
5163 goto End;
5164 errmsg = "truncated data";
5165 startinpos = ((const char *)q) - starts;
5166 endinpos = ((const char *)e) - starts;
5167 break;
5168 /* The remaining input chars are ignored if the callback
5169 chooses to skip the input */
5170 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005171 q -= 2;
5172 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005173 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005174 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005175 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005176 endinpos = ((const char *)e) - starts;
5177 break;
5178 case 2:
5179 errmsg = "illegal encoding";
5180 startinpos = ((const char *)q) - 2 - starts;
5181 endinpos = startinpos + 2;
5182 break;
5183 case 3:
5184 errmsg = "illegal UTF-16 surrogate";
5185 startinpos = ((const char *)q) - 4 - starts;
5186 endinpos = startinpos + 2;
5187 break;
5188 default:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005189 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005190 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005191 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
5192 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005193 continue;
5194 }
5195
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005196 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005197 errors,
5198 &errorHandler,
5199 "utf16", errmsg,
5200 &starts,
5201 (const char **)&e,
5202 &startinpos,
5203 &endinpos,
5204 &exc,
5205 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005206 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005207 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 }
5209
Antoine Pitrou63065d72012-05-15 23:48:04 +02005210End:
Walter Dörwald69652032004-09-07 20:24:22 +00005211 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005212 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005213
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005214 Py_XDECREF(errorHandler);
5215 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005216 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217
Benjamin Peterson29060642009-01-31 22:14:21 +00005218 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005219 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005220 Py_XDECREF(errorHandler);
5221 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222 return NULL;
5223}
5224
Tim Peters772747b2001-08-09 22:21:55 +00005225PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005226_PyUnicode_EncodeUTF16(PyObject *str,
5227 const char *errors,
5228 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005230 enum PyUnicode_Kind kind;
5231 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005232 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005233 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005234 unsigned short *out;
5235 Py_ssize_t bytesize;
5236 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005237#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005238 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005239#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005240 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005241#endif
5242
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005243 if (!PyUnicode_Check(str)) {
5244 PyErr_BadArgument();
5245 return NULL;
5246 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005247 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005248 return NULL;
5249 kind = PyUnicode_KIND(str);
5250 data = PyUnicode_DATA(str);
5251 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005252
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005253 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005254 if (kind == PyUnicode_4BYTE_KIND) {
5255 const Py_UCS4 *in = (const Py_UCS4 *)data;
5256 const Py_UCS4 *end = in + len;
5257 while (in < end)
5258 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005259 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005260 }
5261 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005262 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005263 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005264 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265 if (v == NULL)
5266 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005268 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005269 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005270 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005272 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005273 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005274 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005275
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005276 switch (kind) {
5277 case PyUnicode_1BYTE_KIND: {
5278 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5279 break;
Tim Peters772747b2001-08-09 22:21:55 +00005280 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005281 case PyUnicode_2BYTE_KIND: {
5282 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5283 break;
Tim Peters772747b2001-08-09 22:21:55 +00005284 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005285 case PyUnicode_4BYTE_KIND: {
5286 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5287 break;
5288 }
5289 default:
5290 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005291 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005292
5293 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005294 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295}
5296
Alexander Belopolsky40018472011-02-26 01:02:56 +00005297PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005298PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5299 Py_ssize_t size,
5300 const char *errors,
5301 int byteorder)
5302{
5303 PyObject *result;
5304 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5305 if (tmp == NULL)
5306 return NULL;
5307 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5308 Py_DECREF(tmp);
5309 return result;
5310}
5311
5312PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005313PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005315 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316}
5317
5318/* --- Unicode Escape Codec ----------------------------------------------- */
5319
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the input is still a pure ASCII string once its escapes are decoded.

   s     Input bytes (need not be NUL-terminated).
   size  Number of bytes available at s.

   Returns the number of characters the decoded string will contain when
   every byte is ASCII and every backslash escape is either one of the
   simple single-character escapes, a backslash followed by a newline
   (which produces no character), or an unrecognised escape (which is
   kept as two characters).

   Returns -1 when size is negative, when a non-ASCII byte is present,
   when a backslash is the last byte, or when an octal, x, u, U or N
   escape is present, since those do not guarantee an ASCII result.
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming p + size: pointer arithmetic
       that lands in front of the buffer is undefined behaviour. */
    if (size < 0)
        return -1;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + newline result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
            }
        }
    }
    return length;
}
5374
Fredrik Lundh06d12682001-01-24 07:59:11 +00005375static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005376
Alexander Belopolsky40018472011-02-26 01:02:56 +00005377PyObject *
5378PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005379 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005380 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005382 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005383 Py_ssize_t startinpos;
5384 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005385 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005387 char* message;
5388 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005389 PyObject *errorHandler = NULL;
5390 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005391 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005392
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005393 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005394 if (len == 0)
5395 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005396
5397 /* After length_of_escaped_ascii_string() there are two alternatives,
5398 either the string is pure ASCII with named escapes like \n, etc.
5399 and we determined it's exact size (common case)
5400 or it contains \x, \u, ... escape sequences. then we create a
5401 legacy wchar string and resize it at the end of this function. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005402 _PyUnicodeWriter_Init(&writer, 0);
5403 if (len > 0) {
5404 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005405 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005406 assert(writer.kind == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005407 }
5408 else {
5409 /* Escaped strings will always be longer than the resulting
5410 Unicode string, so we start with size here and then reduce the
5411 length after conversion to the true value.
5412 (but if the error callback returns a long replacement string
5413 we'll have to allocate more space) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005414 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005415 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005416 }
5417
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005419 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005421
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 while (s < end) {
5423 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005424 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005425 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426
5427 /* Non-escape characters are interpreted as Unicode ordinals */
5428 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005429 x = (unsigned char)*s;
5430 s++;
5431 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005432 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005433 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5434 writer.pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 continue;
5436 }
5437
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005438 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439 /* \ - Escapes */
5440 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005441 c = *s++;
5442 if (s > end)
5443 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005444
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005445 /* The only case in which i == ascii_length is a backslash
5446 followed by a newline. */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005447 assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005448
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005449 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450
Benjamin Peterson29060642009-01-31 22:14:21 +00005451 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005452#define WRITECHAR(ch) \
5453 do { \
5454 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1) \
5455 goto onError; \
5456 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch); \
5457 writer.pos++; \
5458 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005459
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005461 case '\\': WRITECHAR('\\'); break;
5462 case '\'': WRITECHAR('\''); break;
5463 case '\"': WRITECHAR('\"'); break;
5464 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005465 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005466 case 'f': WRITECHAR('\014'); break;
5467 case 't': WRITECHAR('\t'); break;
5468 case 'n': WRITECHAR('\n'); break;
5469 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005470 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005471 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005472 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005473 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474
Benjamin Peterson29060642009-01-31 22:14:21 +00005475 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476 case '0': case '1': case '2': case '3':
5477 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005478 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005479 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005480 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005481 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005482 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005484 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485 break;
5486
Benjamin Peterson29060642009-01-31 22:14:21 +00005487 /* hex escapes */
5488 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005490 digits = 2;
5491 message = "truncated \\xXX escape";
5492 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005496 digits = 4;
5497 message = "truncated \\uXXXX escape";
5498 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499
Benjamin Peterson29060642009-01-31 22:14:21 +00005500 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005501 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005502 digits = 8;
5503 message = "truncated \\UXXXXXXXX escape";
5504 hexescape:
5505 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005506 if (end - s < digits) {
5507 /* count only hex digits */
5508 for (; s < end; ++s) {
5509 c = (unsigned char)*s;
5510 if (!Py_ISXDIGIT(c))
5511 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005512 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005513 goto error;
5514 }
5515 for (; digits--; ++s) {
5516 c = (unsigned char)*s;
5517 if (!Py_ISXDIGIT(c))
5518 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005519 chr = (chr<<4) & ~0xF;
5520 if (c >= '0' && c <= '9')
5521 chr += c - '0';
5522 else if (c >= 'a' && c <= 'f')
5523 chr += 10 + c - 'a';
5524 else
5525 chr += 10 + c - 'A';
5526 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005527 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005528 /* _decoding_error will have already written into the
5529 target buffer. */
5530 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005531 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005532 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005533 message = "illegal Unicode character";
5534 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005535 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005536 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005537 break;
5538
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005540 case 'N':
5541 message = "malformed \\N character escape";
5542 if (ucnhash_CAPI == NULL) {
5543 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005544 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5545 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005546 if (ucnhash_CAPI == NULL)
5547 goto ucnhashError;
5548 }
5549 if (*s == '{') {
5550 const char *start = s+1;
5551 /* look for the closing brace */
5552 while (*s != '}' && s < end)
5553 s++;
5554 if (s > start && s < end && *s == '}') {
5555 /* found a name. look it up in the unicode database */
5556 message = "unknown Unicode character name";
5557 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005558 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005559 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005560 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005561 goto store;
5562 }
5563 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005564 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005565
5566 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005567 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005568 message = "\\ at end of string";
5569 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005570 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005571 }
5572 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005573 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005574 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005575 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005576 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005578 continue;
5579
5580 error:
5581 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005582 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005583 errors, &errorHandler,
5584 "unicodeescape", message,
5585 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005586 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005587 goto onError;
5588 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005590#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005591
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005592 Py_XDECREF(errorHandler);
5593 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005594 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005595
Benjamin Peterson29060642009-01-31 22:14:21 +00005596 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005597 PyErr_SetString(
5598 PyExc_UnicodeError,
5599 "\\N escapes not supported (can't load unicodedata module)"
5600 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005601 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005602 Py_XDECREF(errorHandler);
5603 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005604 return NULL;
5605
Benjamin Peterson29060642009-01-31 22:14:21 +00005606 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005607 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005608 Py_XDECREF(errorHandler);
5609 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610 return NULL;
5611}
5612
5613/* Return a Unicode-Escape string version of the Unicode object.
5614
5615 If quotes is true, the string is enclosed in u"" or u'' quotes as
5616 appropriate.
5617
5618*/
5619
Alexander Belopolsky40018472011-02-26 01:02:56 +00005620PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005621PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005623 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005624 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005626 int kind;
5627 void *data;
5628 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629
Ezio Melottie7f90372012-10-05 03:33:31 +03005630 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005631 escape.
5632
Ezio Melottie7f90372012-10-05 03:33:31 +03005633 For UCS1 strings it's '\xxx', 4 bytes per source character.
5634 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5635 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005636 */
5637
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005638 if (!PyUnicode_Check(unicode)) {
5639 PyErr_BadArgument();
5640 return NULL;
5641 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005642 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005643 return NULL;
5644 len = PyUnicode_GET_LENGTH(unicode);
5645 kind = PyUnicode_KIND(unicode);
5646 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005647 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005648 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5649 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5650 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5651 }
5652
5653 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005654 return PyBytes_FromStringAndSize(NULL, 0);
5655
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005656 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005658
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005659 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005660 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005661 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005662 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 if (repr == NULL)
5664 return NULL;
5665
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005666 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005668 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005669 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005670
Walter Dörwald79e913e2007-05-12 11:08:06 +00005671 /* Escape backslashes */
5672 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673 *p++ = '\\';
5674 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005675 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005676 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005677
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005678 /* Map 21-bit characters to '\U00xxxxxx' */
5679 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005680 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005681 *p++ = '\\';
5682 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005683 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5684 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5685 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5686 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5687 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5688 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5689 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5690 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005691 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005692 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005693
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005695 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696 *p++ = '\\';
5697 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005698 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5699 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5700 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5701 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005703
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005704 /* Map special whitespace to '\t', \n', '\r' */
5705 else if (ch == '\t') {
5706 *p++ = '\\';
5707 *p++ = 't';
5708 }
5709 else if (ch == '\n') {
5710 *p++ = '\\';
5711 *p++ = 'n';
5712 }
5713 else if (ch == '\r') {
5714 *p++ = '\\';
5715 *p++ = 'r';
5716 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005717
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005718 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005719 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005721 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005722 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5723 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005724 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005725
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 /* Copy everything else as-is */
5727 else
5728 *p++ = (char) ch;
5729 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005731 assert(p - PyBytes_AS_STRING(repr) > 0);
5732 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5733 return NULL;
5734 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735}
5736
Alexander Belopolsky40018472011-02-26 01:02:56 +00005737PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005738PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5739 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005741 PyObject *result;
5742 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5743 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005745 result = PyUnicode_AsUnicodeEscapeString(tmp);
5746 Py_DECREF(tmp);
5747 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748}
5749
5750/* --- Raw Unicode Escape Codec ------------------------------------------- */
5751
Alexander Belopolsky40018472011-02-26 01:02:56 +00005752PyObject *
5753PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005754 Py_ssize_t size,
5755 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005757 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005758 Py_ssize_t startinpos;
5759 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005760 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 const char *end;
5762 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005763 PyObject *errorHandler = NULL;
5764 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005765
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005766 if (size == 0)
5767 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005768
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 /* Escaped strings will always be longer than the resulting
5770 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 length after conversion to the true value. (But decoding error
5772 handler might have to resize the string) */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005773 _PyUnicodeWriter_Init(&writer, 1);
5774 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00005775 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005776
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777 end = s + size;
5778 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 unsigned char c;
5780 Py_UCS4 x;
5781 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005782 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783
Benjamin Peterson29060642009-01-31 22:14:21 +00005784 /* Non-escape characters are interpreted as Unicode ordinals */
5785 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005786 x = (unsigned char)*s++;
5787 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005788 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005789 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5790 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005792 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 startinpos = s-starts;
5794
5795 /* \u-escapes are only interpreted iff the number of leading
5796 backslashes if odd */
5797 bs = s;
5798 for (;s < end;) {
5799 if (*s != '\\')
5800 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005801 x = (unsigned char)*s++;
5802 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005803 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005804 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5805 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 }
5807 if (((s - bs) & 1) == 0 ||
5808 s >= end ||
5809 (*s != 'u' && *s != 'U')) {
5810 continue;
5811 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005812 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 count = *s=='u' ? 4 : 8;
5814 s++;
5815
5816 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005817 for (x = 0, i = 0; i < count; ++i, ++s) {
5818 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005819 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005820 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005821 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 errors, &errorHandler,
5823 "rawunicodeescape", "truncated \\uXXXX",
5824 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005825 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 goto onError;
5827 goto nextByte;
5828 }
5829 x = (x<<4) & ~0xF;
5830 if (c >= '0' && c <= '9')
5831 x += c - '0';
5832 else if (c >= 'a' && c <= 'f')
5833 x += 10 + c - 'a';
5834 else
5835 x += 10 + c - 'A';
5836 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005837 if (x <= MAX_UNICODE) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005838 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005839 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005840 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
5841 writer.pos++;
5842 }
5843 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00005844 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005845 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005846 errors, &errorHandler,
5847 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005849 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005850 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005851 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005852 nextByte:
5853 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005855 Py_XDECREF(errorHandler);
5856 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005857 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00005858
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005860 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005861 Py_XDECREF(errorHandler);
5862 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 return NULL;
5864}
5865
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005866
Alexander Belopolsky40018472011-02-26 01:02:56 +00005867PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005868PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005870 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 char *p;
5872 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005873 Py_ssize_t expandsize, pos;
5874 int kind;
5875 void *data;
5876 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005878 if (!PyUnicode_Check(unicode)) {
5879 PyErr_BadArgument();
5880 return NULL;
5881 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005882 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005883 return NULL;
5884 kind = PyUnicode_KIND(unicode);
5885 data = PyUnicode_DATA(unicode);
5886 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005887 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5888 bytes, and 1 byte characters 4. */
5889 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005890
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005891 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005892 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005893
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005894 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 if (repr == NULL)
5896 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005897 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005898 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005900 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005901 for (pos = 0; pos < len; pos++) {
5902 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 /* Map 32-bit characters to '\Uxxxxxxxx' */
5904 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005905 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005906 *p++ = '\\';
5907 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005908 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5909 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5910 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5911 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5912 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5913 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5914 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5915 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005916 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005918 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919 *p++ = '\\';
5920 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005921 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5922 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5923 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5924 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 /* Copy everything else as-is */
5927 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 *p++ = (char) ch;
5929 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005930
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005931 assert(p > q);
5932 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005933 return NULL;
5934 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935}
5936
Alexander Belopolsky40018472011-02-26 01:02:56 +00005937PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005938PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5939 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005941 PyObject *result;
5942 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5943 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00005944 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005945 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
5946 Py_DECREF(tmp);
5947 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948}
5949
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005950/* --- Unicode Internal Codec ------------------------------------------- */
5951
Alexander Belopolsky40018472011-02-26 01:02:56 +00005952PyObject *
5953_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005954 Py_ssize_t size,
5955 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005956{
5957 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005958 Py_ssize_t startinpos;
5959 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005960 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005961 const char *end;
5962 const char *reason;
5963 PyObject *errorHandler = NULL;
5964 PyObject *exc = NULL;
5965
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005966 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02005967 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005968 1))
5969 return NULL;
5970
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005971 if (size == 0)
5972 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005973
Thomas Wouters89f507f2006-12-13 04:49:30 +00005974 /* XXX overflow detection missing */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005975 _PyUnicodeWriter_Init(&writer, 0);
5976 if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005978 end = s + size;
5979
5980 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005981 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005982 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02005983 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02005984 endinpos = end-starts;
5985 reason = "truncated input";
5986 goto error;
5987 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005988 /* We copy the raw representation one byte at a time because the
5989 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005990 ((char *) &uch)[0] = s[0];
5991 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005992#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005993 ((char *) &uch)[2] = s[2];
5994 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005995#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005996 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02005997#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005998 /* We have to sanity check the raw data, otherwise doom looms for
5999 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006000 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006001 endinpos = s - starts + Py_UNICODE_SIZE;
6002 reason = "illegal code point (> 0x10FFFF)";
6003 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006004 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006005#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006006 s += Py_UNICODE_SIZE;
6007#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006008 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006009 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006010 Py_UNICODE uch2;
6011 ((char *) &uch2)[0] = s[0];
6012 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006013 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006014 {
Victor Stinner551ac952011-11-29 22:58:13 +01006015 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006016 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006017 }
6018 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006019#endif
6020
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006021 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006022 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006023 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
6024 writer.pos++;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006025 continue;
6026
6027 error:
6028 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006029 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006030 errors, &errorHandler,
6031 "unicode_internal", reason,
6032 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006033 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006034 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006035 }
6036
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006037 Py_XDECREF(errorHandler);
6038 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006039 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006040
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006042 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006043 Py_XDECREF(errorHandler);
6044 Py_XDECREF(exc);
6045 return NULL;
6046}
6047
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048/* --- Latin-1 Codec ------------------------------------------------------ */
6049
Alexander Belopolsky40018472011-02-26 01:02:56 +00006050PyObject *
6051PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006052 Py_ssize_t size,
6053 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006056 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057}
6058
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006059/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006060static void
6061make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006062 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006063 PyObject *unicode,
6064 Py_ssize_t startpos, Py_ssize_t endpos,
6065 const char *reason)
6066{
6067 if (*exceptionObject == NULL) {
6068 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006069 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006070 encoding, unicode, startpos, endpos, reason);
6071 }
6072 else {
6073 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6074 goto onError;
6075 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6076 goto onError;
6077 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6078 goto onError;
6079 return;
6080 onError:
6081 Py_DECREF(*exceptionObject);
6082 *exceptionObject = NULL;
6083 }
6084}
6085
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006086/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006087static void
6088raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006089 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006090 PyObject *unicode,
6091 Py_ssize_t startpos, Py_ssize_t endpos,
6092 const char *reason)
6093{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006094 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006095 encoding, unicode, startpos, endpos, reason);
6096 if (*exceptionObject != NULL)
6097 PyCodec_StrictErrors(*exceptionObject);
6098}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006099
6100/* error handling callback helper:
6101 build arguments, call the callback and check the arguments,
6102 put the result into newpos and return the replacement string, which
6103 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006104static PyObject *
6105unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006106 PyObject **errorHandler,
6107 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006108 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006109 Py_ssize_t startpos, Py_ssize_t endpos,
6110 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006112 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006113 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006114 PyObject *restuple;
6115 PyObject *resunicode;
6116
6117 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006119 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006120 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006121 }
6122
Benjamin Petersonbac79492012-01-14 13:34:47 -05006123 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006124 return NULL;
6125 len = PyUnicode_GET_LENGTH(unicode);
6126
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006127 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006128 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006129 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131
6132 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006136 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006137 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 Py_DECREF(restuple);
6139 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006141 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006142 &resunicode, newpos)) {
6143 Py_DECREF(restuple);
6144 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006145 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006146 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6147 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6148 Py_DECREF(restuple);
6149 return NULL;
6150 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006151 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006152 *newpos = len + *newpos;
6153 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6155 Py_DECREF(restuple);
6156 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006157 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006158 Py_INCREF(resunicode);
6159 Py_DECREF(restuple);
6160 return resunicode;
6161}
6162
Alexander Belopolsky40018472011-02-26 01:02:56 +00006163static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006164unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006165 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006166 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006167{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006168 /* input state */
6169 Py_ssize_t pos=0, size;
6170 int kind;
6171 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006172 /* output object */
6173 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006174 /* pointer into the output */
6175 char *str;
6176 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006177 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006178 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6179 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006180 PyObject *errorHandler = NULL;
6181 PyObject *exc = NULL;
6182 /* the following variable is used for caching string comparisons
6183 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6184 int known_errorHandler = -1;
6185
Benjamin Petersonbac79492012-01-14 13:34:47 -05006186 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006187 return NULL;
6188 size = PyUnicode_GET_LENGTH(unicode);
6189 kind = PyUnicode_KIND(unicode);
6190 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006191 /* allocate enough for a simple encoding without
6192 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006193 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006194 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006195 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006196 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006197 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006198 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006199 ressize = size;
6200
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006201 while (pos < size) {
6202 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006203
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 /* can we encode this? */
6205 if (c<limit) {
6206 /* no overflow check, because we know that the space is enough */
6207 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006208 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006209 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006211 Py_ssize_t requiredsize;
6212 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006213 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006215 Py_ssize_t collstart = pos;
6216 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006217 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006218 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 ++collend;
6220 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6221 if (known_errorHandler==-1) {
6222 if ((errors==NULL) || (!strcmp(errors, "strict")))
6223 known_errorHandler = 1;
6224 else if (!strcmp(errors, "replace"))
6225 known_errorHandler = 2;
6226 else if (!strcmp(errors, "ignore"))
6227 known_errorHandler = 3;
6228 else if (!strcmp(errors, "xmlcharrefreplace"))
6229 known_errorHandler = 4;
6230 else
6231 known_errorHandler = 0;
6232 }
6233 switch (known_errorHandler) {
6234 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006235 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 goto onError;
6237 case 2: /* replace */
6238 while (collstart++<collend)
6239 *str++ = '?'; /* fall through */
6240 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006241 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 break;
6243 case 4: /* xmlcharrefreplace */
6244 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006245 /* determine replacement size */
6246 for (i = collstart, repsize = 0; i < collend; ++i) {
6247 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6248 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006250 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006252 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006254 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006256 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006258 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006260 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006261 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006263 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006264 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006265 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006266 if (requiredsize > ressize) {
6267 if (requiredsize<2*ressize)
6268 requiredsize = 2*ressize;
6269 if (_PyBytes_Resize(&res, requiredsize))
6270 goto onError;
6271 str = PyBytes_AS_STRING(res) + respos;
6272 ressize = requiredsize;
6273 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006274 /* generate replacement */
6275 for (i = collstart; i < collend; ++i) {
6276 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006278 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006279 break;
6280 default:
6281 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006282 encoding, reason, unicode, &exc,
6283 collstart, collend, &newpos);
6284 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006285 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006287 if (PyBytes_Check(repunicode)) {
6288 /* Directly copy bytes result to output. */
6289 repsize = PyBytes_Size(repunicode);
6290 if (repsize > 1) {
6291 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006292 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006293 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6294 Py_DECREF(repunicode);
6295 goto onError;
6296 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006297 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006298 ressize += repsize-1;
6299 }
6300 memcpy(str, PyBytes_AsString(repunicode), repsize);
6301 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006302 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006303 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006304 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006305 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 /* need more space? (at least enough for what we
6307 have+the replacement+the rest of the string, so
6308 we won't have to check space for encodable characters) */
6309 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006310 repsize = PyUnicode_GET_LENGTH(repunicode);
6311 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006312 if (requiredsize > ressize) {
6313 if (requiredsize<2*ressize)
6314 requiredsize = 2*ressize;
6315 if (_PyBytes_Resize(&res, requiredsize)) {
6316 Py_DECREF(repunicode);
6317 goto onError;
6318 }
6319 str = PyBytes_AS_STRING(res) + respos;
6320 ressize = requiredsize;
6321 }
6322 /* check if there is anything unencodable in the replacement
6323 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006324 for (i = 0; repsize-->0; ++i, ++str) {
6325 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006327 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006328 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 Py_DECREF(repunicode);
6330 goto onError;
6331 }
6332 *str = (char)c;
6333 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006334 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006335 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006336 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006337 }
6338 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006339 /* Resize if we allocated to much */
6340 size = str - PyBytes_AS_STRING(res);
6341 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006342 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006343 if (_PyBytes_Resize(&res, size) < 0)
6344 goto onError;
6345 }
6346
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006347 Py_XDECREF(errorHandler);
6348 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006349 return res;
6350
6351 onError:
6352 Py_XDECREF(res);
6353 Py_XDECREF(errorHandler);
6354 Py_XDECREF(exc);
6355 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006356}
6357
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006358/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006359PyObject *
6360PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006361 Py_ssize_t size,
6362 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006363{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006364 PyObject *result;
6365 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6366 if (unicode == NULL)
6367 return NULL;
6368 result = unicode_encode_ucs1(unicode, errors, 256);
6369 Py_DECREF(unicode);
6370 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371}
6372
Alexander Belopolsky40018472011-02-26 01:02:56 +00006373PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006374_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375{
6376 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 PyErr_BadArgument();
6378 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006380 if (PyUnicode_READY(unicode) == -1)
6381 return NULL;
6382 /* Fast path: if it is a one-byte string, construct
6383 bytes object directly. */
6384 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6385 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6386 PyUnicode_GET_LENGTH(unicode));
6387 /* Non-Latin-1 characters present. Defer to above function to
6388 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006389 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006390}
6391
6392PyObject*
6393PyUnicode_AsLatin1String(PyObject *unicode)
6394{
6395 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006396}
6397
6398/* --- 7-bit ASCII Codec -------------------------------------------------- */
6399
Alexander Belopolsky40018472011-02-26 01:02:56 +00006400PyObject *
6401PyUnicode_DecodeASCII(const char *s,
6402 Py_ssize_t size,
6403 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006405 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006406 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006407 int kind;
6408 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006409 Py_ssize_t startinpos;
6410 Py_ssize_t endinpos;
6411 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006412 const char *e;
6413 PyObject *errorHandler = NULL;
6414 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006415
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006417 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006418
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006420 if (size == 1 && (unsigned char)s[0] < 128)
6421 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006422
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006423 _PyUnicodeWriter_Init(&writer, 0);
6424 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006426
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006427 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006428 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006429 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006430 writer.pos = outpos;
6431 if (writer.pos == size)
6432 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006433
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006434 s += writer.pos;
6435 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 register unsigned char c = (unsigned char)*s;
6438 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006439 PyUnicode_WRITE(kind, data, writer.pos, c);
6440 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 ++s;
6442 }
6443 else {
6444 startinpos = s-starts;
6445 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006446 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 errors, &errorHandler,
6448 "ascii", "ordinal not in range(128)",
6449 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006450 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006451 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006452 kind = writer.kind;
6453 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006456 Py_XDECREF(errorHandler);
6457 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006458 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006459
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006461 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006462 Py_XDECREF(errorHandler);
6463 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 return NULL;
6465}
6466
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006467/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006468PyObject *
6469PyUnicode_EncodeASCII(const Py_UNICODE *p,
6470 Py_ssize_t size,
6471 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006473 PyObject *result;
6474 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6475 if (unicode == NULL)
6476 return NULL;
6477 result = unicode_encode_ucs1(unicode, errors, 128);
6478 Py_DECREF(unicode);
6479 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480}
6481
Alexander Belopolsky40018472011-02-26 01:02:56 +00006482PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006483_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484{
6485 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006486 PyErr_BadArgument();
6487 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006489 if (PyUnicode_READY(unicode) == -1)
6490 return NULL;
6491 /* Fast path: if it is an ASCII-only string, construct bytes object
6492 directly. Else defer to above function to raise the exception. */
6493 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6494 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6495 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006496 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006497}
6498
6499PyObject *
6500PyUnicode_AsASCIIString(PyObject *unicode)
6501{
6502 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503}
6504
Victor Stinner99b95382011-07-04 14:23:54 +02006505#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006506
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006507/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006508
/* Defined when int is narrower than size_t.  The code page decoder below
   tests it with #ifdef to split inputs larger than INT_MAX into chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* Fallback definition of the flag, used only if the included headers do not
   already provide it (presumably older Windows SDK headers; TODO confirm). */
#ifndef WC_ERR_INVALID_CHARS
# define WC_ERR_INVALID_CHARS 0x0080
#endif
6516
6517static char*
6518code_page_name(UINT code_page, PyObject **obj)
6519{
6520 *obj = NULL;
6521 if (code_page == CP_ACP)
6522 return "mbcs";
6523 if (code_page == CP_UTF7)
6524 return "CP_UTF7";
6525 if (code_page == CP_UTF8)
6526 return "CP_UTF8";
6527
6528 *obj = PyBytes_FromFormat("cp%u", code_page);
6529 if (*obj == NULL)
6530 return NULL;
6531 return PyBytes_AS_STRING(*obj);
6532}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006533
Alexander Belopolsky40018472011-02-26 01:02:56 +00006534static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006535is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006536{
6537 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006538 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006539
Victor Stinner3a50e702011-10-18 21:21:00 +02006540 if (!IsDBCSLeadByteEx(code_page, *curr))
6541 return 0;
6542
6543 prev = CharPrevExA(code_page, s, curr, 0);
6544 if (prev == curr)
6545 return 1;
6546 /* FIXME: This code is limited to "true" double-byte encodings,
6547 as it assumes an incomplete character consists of a single
6548 byte. */
6549 if (curr - prev == 2)
6550 return 1;
6551 if (!IsDBCSLeadByteEx(code_page, *prev))
6552 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006553 return 0;
6554}
6555
Victor Stinner3a50e702011-10-18 21:21:00 +02006556static DWORD
6557decode_code_page_flags(UINT code_page)
6558{
6559 if (code_page == CP_UTF7) {
6560 /* The CP_UTF7 decoder only supports flags=0 */
6561 return 0;
6562 }
6563 else
6564 return MB_ERR_INVALID_CHARS;
6565}
6566
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006567/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006568 * Decode a byte string from a Windows code page into unicode object in strict
6569 * mode.
6570 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006571 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6572 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006573 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006574static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006575decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006576 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006577 const char *in,
6578 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006579{
Victor Stinner3a50e702011-10-18 21:21:00 +02006580 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006581 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006582 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006583
6584 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006585 assert(insize > 0);
6586 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6587 if (outsize <= 0)
6588 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006589
6590 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006592 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006593 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006594 if (*v == NULL)
6595 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006596 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006597 }
6598 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006599 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006600 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006601 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006602 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006603 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006604 }
6605
6606 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006607 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6608 if (outsize <= 0)
6609 goto error;
6610 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006611
Victor Stinner3a50e702011-10-18 21:21:00 +02006612error:
6613 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6614 return -2;
6615 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006616 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006617}
6618
Victor Stinner3a50e702011-10-18 21:21:00 +02006619/*
6620 * Decode a byte string from a code page into unicode object with an error
6621 * handler.
6622 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006623 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006624 * UnicodeDecodeError exception and returns -1 on error.
6625 */
6626static int
6627decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006628 PyObject **v,
6629 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006630 const char *errors)
6631{
6632 const char *startin = in;
6633 const char *endin = in + size;
6634 const DWORD flags = decode_code_page_flags(code_page);
6635 /* Ideally, we should get reason from FormatMessage. This is the Windows
6636 2000 English version of the message. */
6637 const char *reason = "No mapping for the Unicode character exists "
6638 "in the target code page.";
6639 /* each step cannot decode more than 1 character, but a character can be
6640 represented as a surrogate pair */
6641 wchar_t buffer[2], *startout, *out;
6642 int insize, outsize;
6643 PyObject *errorHandler = NULL;
6644 PyObject *exc = NULL;
6645 PyObject *encoding_obj = NULL;
6646 char *encoding;
6647 DWORD err;
6648 int ret = -1;
6649
6650 assert(size > 0);
6651
6652 encoding = code_page_name(code_page, &encoding_obj);
6653 if (encoding == NULL)
6654 return -1;
6655
6656 if (errors == NULL || strcmp(errors, "strict") == 0) {
6657 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6658 UnicodeDecodeError. */
6659 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6660 if (exc != NULL) {
6661 PyCodec_StrictErrors(exc);
6662 Py_CLEAR(exc);
6663 }
6664 goto error;
6665 }
6666
6667 if (*v == NULL) {
6668 /* Create unicode object */
6669 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6670 PyErr_NoMemory();
6671 goto error;
6672 }
Victor Stinnerab595942011-12-17 04:59:06 +01006673 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006674 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006675 if (*v == NULL)
6676 goto error;
6677 startout = PyUnicode_AS_UNICODE(*v);
6678 }
6679 else {
6680 /* Extend unicode object */
6681 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6682 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6683 PyErr_NoMemory();
6684 goto error;
6685 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006686 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006687 goto error;
6688 startout = PyUnicode_AS_UNICODE(*v) + n;
6689 }
6690
6691 /* Decode the byte string character per character */
6692 out = startout;
6693 while (in < endin)
6694 {
6695 /* Decode a character */
6696 insize = 1;
6697 do
6698 {
6699 outsize = MultiByteToWideChar(code_page, flags,
6700 in, insize,
6701 buffer, Py_ARRAY_LENGTH(buffer));
6702 if (outsize > 0)
6703 break;
6704 err = GetLastError();
6705 if (err != ERROR_NO_UNICODE_TRANSLATION
6706 && err != ERROR_INSUFFICIENT_BUFFER)
6707 {
6708 PyErr_SetFromWindowsErr(0);
6709 goto error;
6710 }
6711 insize++;
6712 }
6713 /* 4=maximum length of a UTF-8 sequence */
6714 while (insize <= 4 && (in + insize) <= endin);
6715
6716 if (outsize <= 0) {
6717 Py_ssize_t startinpos, endinpos, outpos;
6718
6719 startinpos = in - startin;
6720 endinpos = startinpos + 1;
6721 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006722 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006723 errors, &errorHandler,
6724 encoding, reason,
6725 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006726 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006727 {
6728 goto error;
6729 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006730 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006731 }
6732 else {
6733 in += insize;
6734 memcpy(out, buffer, outsize * sizeof(wchar_t));
6735 out += outsize;
6736 }
6737 }
6738
6739 /* write a NUL character at the end */
6740 *out = 0;
6741
6742 /* Extend unicode object */
6743 outsize = out - startout;
6744 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006745 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006746 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006747 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006748
6749error:
6750 Py_XDECREF(encoding_obj);
6751 Py_XDECREF(errorHandler);
6752 Py_XDECREF(exc);
6753 return ret;
6754}
6755
Victor Stinner3a50e702011-10-18 21:21:00 +02006756static PyObject *
6757decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006758 const char *s, Py_ssize_t size,
6759 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006760{
Victor Stinner76a31a62011-11-04 00:05:13 +01006761 PyObject *v = NULL;
6762 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006763
Victor Stinner3a50e702011-10-18 21:21:00 +02006764 if (code_page < 0) {
6765 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6766 return NULL;
6767 }
6768
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006769 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006771
Victor Stinner76a31a62011-11-04 00:05:13 +01006772 do
6773 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006774#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006775 if (size > INT_MAX) {
6776 chunk_size = INT_MAX;
6777 final = 0;
6778 done = 0;
6779 }
6780 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006781#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006782 {
6783 chunk_size = (int)size;
6784 final = (consumed == NULL);
6785 done = 1;
6786 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006787
Victor Stinner76a31a62011-11-04 00:05:13 +01006788 /* Skip trailing lead-byte unless 'final' is set */
6789 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6790 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006791
Victor Stinner76a31a62011-11-04 00:05:13 +01006792 if (chunk_size == 0 && done) {
6793 if (v != NULL)
6794 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006795 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006796 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006797
Victor Stinner76a31a62011-11-04 00:05:13 +01006798
6799 converted = decode_code_page_strict(code_page, &v,
6800 s, chunk_size);
6801 if (converted == -2)
6802 converted = decode_code_page_errors(code_page, &v,
6803 s, chunk_size,
6804 errors);
6805 assert(converted != 0);
6806
6807 if (converted < 0) {
6808 Py_XDECREF(v);
6809 return NULL;
6810 }
6811
6812 if (consumed)
6813 *consumed += converted;
6814
6815 s += converted;
6816 size -= converted;
6817 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006818
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006819 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006820}
6821
Alexander Belopolsky40018472011-02-26 01:02:56 +00006822PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006823PyUnicode_DecodeCodePageStateful(int code_page,
6824 const char *s,
6825 Py_ssize_t size,
6826 const char *errors,
6827 Py_ssize_t *consumed)
6828{
6829 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6830}
6831
6832PyObject *
6833PyUnicode_DecodeMBCSStateful(const char *s,
6834 Py_ssize_t size,
6835 const char *errors,
6836 Py_ssize_t *consumed)
6837{
6838 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6839}
6840
6841PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006842PyUnicode_DecodeMBCS(const char *s,
6843 Py_ssize_t size,
6844 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006845{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006846 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6847}
6848
Victor Stinner3a50e702011-10-18 21:21:00 +02006849static DWORD
6850encode_code_page_flags(UINT code_page, const char *errors)
6851{
6852 if (code_page == CP_UTF8) {
6853 if (winver.dwMajorVersion >= 6)
6854 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6855 and later */
6856 return WC_ERR_INVALID_CHARS;
6857 else
6858 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6859 return 0;
6860 }
6861 else if (code_page == CP_UTF7) {
6862 /* CP_UTF7 only supports flags=0 */
6863 return 0;
6864 }
6865 else {
6866 if (errors != NULL && strcmp(errors, "replace") == 0)
6867 return 0;
6868 else
6869 return WC_NO_BEST_FIT_CHARS;
6870 }
6871}
6872
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006873/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006874 * Encode a Unicode string to a Windows code page into a byte string in strict
6875 * mode.
6876 *
6877 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006878 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006879 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006880static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006881encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006882 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006883 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006884{
Victor Stinner554f3f02010-06-16 23:33:54 +00006885 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006886 BOOL *pusedDefaultChar = &usedDefaultChar;
6887 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006888 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006889 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006890 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006891 const DWORD flags = encode_code_page_flags(code_page, NULL);
6892 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006893 /* Create a substring so that we can get the UTF-16 representation
6894 of just the slice under consideration. */
6895 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006896
Martin v. Löwis3d325192011-11-04 18:23:06 +01006897 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006898
Victor Stinner3a50e702011-10-18 21:21:00 +02006899 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006900 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006901 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006902 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006903
Victor Stinner2fc507f2011-11-04 20:06:39 +01006904 substring = PyUnicode_Substring(unicode, offset, offset+len);
6905 if (substring == NULL)
6906 return -1;
6907 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6908 if (p == NULL) {
6909 Py_DECREF(substring);
6910 return -1;
6911 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01006912
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006913 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006914 outsize = WideCharToMultiByte(code_page, flags,
6915 p, size,
6916 NULL, 0,
6917 NULL, pusedDefaultChar);
6918 if (outsize <= 0)
6919 goto error;
6920 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01006921 if (pusedDefaultChar && *pusedDefaultChar) {
6922 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006923 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006924 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006925
Victor Stinner3a50e702011-10-18 21:21:00 +02006926 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006927 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006928 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006929 if (*outbytes == NULL) {
6930 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006931 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006932 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006933 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006934 }
6935 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006936 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006937 const Py_ssize_t n = PyBytes_Size(*outbytes);
6938 if (outsize > PY_SSIZE_T_MAX - n) {
6939 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01006940 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006941 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006942 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01006943 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
6944 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006945 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006946 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006947 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006948 }
6949
6950 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006951 outsize = WideCharToMultiByte(code_page, flags,
6952 p, size,
6953 out, outsize,
6954 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006955 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006956 if (outsize <= 0)
6957 goto error;
6958 if (pusedDefaultChar && *pusedDefaultChar)
6959 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006960 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006961
Victor Stinner3a50e702011-10-18 21:21:00 +02006962error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01006963 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006964 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6965 return -2;
6966 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006967 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006968}
6969
Victor Stinner3a50e702011-10-18 21:21:00 +02006970/*
6971 * Encode a Unicode string to a Windows code page into a byte string using a
6972 * error handler.
6973 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006974 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02006975 * -1 on other error.
6976 */
6977static int
6978encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01006979 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006980 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006981{
Victor Stinner3a50e702011-10-18 21:21:00 +02006982 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006983 Py_ssize_t pos = unicode_offset;
6984 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006985 /* Ideally, we should get reason from FormatMessage. This is the Windows
6986 2000 English version of the message. */
6987 const char *reason = "invalid character";
6988 /* 4=maximum length of a UTF-8 sequence */
6989 char buffer[4];
6990 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
6991 Py_ssize_t outsize;
6992 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006993 PyObject *errorHandler = NULL;
6994 PyObject *exc = NULL;
6995 PyObject *encoding_obj = NULL;
6996 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01006997 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006998 PyObject *rep;
6999 int ret = -1;
7000
7001 assert(insize > 0);
7002
7003 encoding = code_page_name(code_page, &encoding_obj);
7004 if (encoding == NULL)
7005 return -1;
7006
7007 if (errors == NULL || strcmp(errors, "strict") == 0) {
7008 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7009 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007010 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007011 if (exc != NULL) {
7012 PyCodec_StrictErrors(exc);
7013 Py_DECREF(exc);
7014 }
7015 Py_XDECREF(encoding_obj);
7016 return -1;
7017 }
7018
7019 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7020 pusedDefaultChar = &usedDefaultChar;
7021 else
7022 pusedDefaultChar = NULL;
7023
7024 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7025 PyErr_NoMemory();
7026 goto error;
7027 }
7028 outsize = insize * Py_ARRAY_LENGTH(buffer);
7029
7030 if (*outbytes == NULL) {
7031 /* Create string object */
7032 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7033 if (*outbytes == NULL)
7034 goto error;
7035 out = PyBytes_AS_STRING(*outbytes);
7036 }
7037 else {
7038 /* Extend string object */
7039 Py_ssize_t n = PyBytes_Size(*outbytes);
7040 if (n > PY_SSIZE_T_MAX - outsize) {
7041 PyErr_NoMemory();
7042 goto error;
7043 }
7044 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7045 goto error;
7046 out = PyBytes_AS_STRING(*outbytes) + n;
7047 }
7048
7049 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007050 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007051 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007052 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7053 wchar_t chars[2];
7054 int charsize;
7055 if (ch < 0x10000) {
7056 chars[0] = (wchar_t)ch;
7057 charsize = 1;
7058 }
7059 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007060 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7061 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007062 charsize = 2;
7063 }
7064
Victor Stinner3a50e702011-10-18 21:21:00 +02007065 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007066 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007067 buffer, Py_ARRAY_LENGTH(buffer),
7068 NULL, pusedDefaultChar);
7069 if (outsize > 0) {
7070 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7071 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007072 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007073 memcpy(out, buffer, outsize);
7074 out += outsize;
7075 continue;
7076 }
7077 }
7078 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7079 PyErr_SetFromWindowsErr(0);
7080 goto error;
7081 }
7082
Victor Stinner3a50e702011-10-18 21:21:00 +02007083 rep = unicode_encode_call_errorhandler(
7084 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007085 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007086 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007087 if (rep == NULL)
7088 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007089 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007090
7091 if (PyBytes_Check(rep)) {
7092 outsize = PyBytes_GET_SIZE(rep);
7093 if (outsize != 1) {
7094 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7095 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7096 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7097 Py_DECREF(rep);
7098 goto error;
7099 }
7100 out = PyBytes_AS_STRING(*outbytes) + offset;
7101 }
7102 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7103 out += outsize;
7104 }
7105 else {
7106 Py_ssize_t i;
7107 enum PyUnicode_Kind kind;
7108 void *data;
7109
Benjamin Petersonbac79492012-01-14 13:34:47 -05007110 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007111 Py_DECREF(rep);
7112 goto error;
7113 }
7114
7115 outsize = PyUnicode_GET_LENGTH(rep);
7116 if (outsize != 1) {
7117 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7118 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7119 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7120 Py_DECREF(rep);
7121 goto error;
7122 }
7123 out = PyBytes_AS_STRING(*outbytes) + offset;
7124 }
7125 kind = PyUnicode_KIND(rep);
7126 data = PyUnicode_DATA(rep);
7127 for (i=0; i < outsize; i++) {
7128 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7129 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007130 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007131 encoding, unicode,
7132 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007133 "unable to encode error handler result to ASCII");
7134 Py_DECREF(rep);
7135 goto error;
7136 }
7137 *out = (unsigned char)ch;
7138 out++;
7139 }
7140 }
7141 Py_DECREF(rep);
7142 }
7143 /* write a NUL byte */
7144 *out = 0;
7145 outsize = out - PyBytes_AS_STRING(*outbytes);
7146 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7147 if (_PyBytes_Resize(outbytes, outsize) < 0)
7148 goto error;
7149 ret = 0;
7150
7151error:
7152 Py_XDECREF(encoding_obj);
7153 Py_XDECREF(errorHandler);
7154 Py_XDECREF(exc);
7155 return ret;
7156}
7157
Victor Stinner3a50e702011-10-18 21:21:00 +02007158static PyObject *
7159encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007160 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007161 const char *errors)
7162{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007163 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007164 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007165 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007166 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007167
Benjamin Petersonbac79492012-01-14 13:34:47 -05007168 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007169 return NULL;
7170 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007171
Victor Stinner3a50e702011-10-18 21:21:00 +02007172 if (code_page < 0) {
7173 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7174 return NULL;
7175 }
7176
Martin v. Löwis3d325192011-11-04 18:23:06 +01007177 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007178 return PyBytes_FromStringAndSize(NULL, 0);
7179
Victor Stinner7581cef2011-11-03 22:32:33 +01007180 offset = 0;
7181 do
7182 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007183#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007184 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007185 chunks. */
7186 if (len > INT_MAX/2) {
7187 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007188 done = 0;
7189 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007190 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007191#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007192 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007193 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007194 done = 1;
7195 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007196
Victor Stinner76a31a62011-11-04 00:05:13 +01007197 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007198 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007199 errors);
7200 if (ret == -2)
7201 ret = encode_code_page_errors(code_page, &outbytes,
7202 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007203 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007204 if (ret < 0) {
7205 Py_XDECREF(outbytes);
7206 return NULL;
7207 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007208
Victor Stinner7581cef2011-11-03 22:32:33 +01007209 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007210 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007211 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007212
Victor Stinner3a50e702011-10-18 21:21:00 +02007213 return outbytes;
7214}
7215
7216PyObject *
7217PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7218 Py_ssize_t size,
7219 const char *errors)
7220{
Victor Stinner7581cef2011-11-03 22:32:33 +01007221 PyObject *unicode, *res;
7222 unicode = PyUnicode_FromUnicode(p, size);
7223 if (unicode == NULL)
7224 return NULL;
7225 res = encode_code_page(CP_ACP, unicode, errors);
7226 Py_DECREF(unicode);
7227 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007228}
7229
7230PyObject *
7231PyUnicode_EncodeCodePage(int code_page,
7232 PyObject *unicode,
7233 const char *errors)
7234{
Victor Stinner7581cef2011-11-03 22:32:33 +01007235 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007236}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007237
Alexander Belopolsky40018472011-02-26 01:02:56 +00007238PyObject *
7239PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007240{
7241 if (!PyUnicode_Check(unicode)) {
7242 PyErr_BadArgument();
7243 return NULL;
7244 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007245 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007246}
7247
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007248#undef NEED_RETRY
7249
Victor Stinner99b95382011-07-04 14:23:54 +02007250#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007251
Guido van Rossumd57fd912000-03-10 22:53:23 +00007252/* --- Character Mapping Codec -------------------------------------------- */
7253
Alexander Belopolsky40018472011-02-26 01:02:56 +00007254PyObject *
7255PyUnicode_DecodeCharmap(const char *s,
7256 Py_ssize_t size,
7257 PyObject *mapping,
7258 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007259{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007260 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007261 Py_ssize_t startinpos;
7262 Py_ssize_t endinpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007263 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007264 _PyUnicodeWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007265 PyObject *errorHandler = NULL;
7266 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007267
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268 /* Default to Latin-1 */
7269 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007270 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007273 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007274 _PyUnicodeWriter_Init(&writer, 0);
7275 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007276 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007277
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007278 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007279 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007280 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007281 enum PyUnicode_Kind mapkind;
7282 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007283 Py_UCS4 x;
7284
Benjamin Petersonbac79492012-01-14 13:34:47 -05007285 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007286 return NULL;
7287
7288 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007289 mapdata = PyUnicode_DATA(mapping);
7290 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007292 unsigned char ch;
7293 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007294 enum PyUnicode_Kind outkind = writer.kind;
7295 void *outdata = writer.data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007296 if (outkind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007297 Py_UCS4 maxchar = writer.maxchar;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007298 while (s < e) {
7299 unsigned char ch = *s;
7300 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7301 if (x > maxchar)
7302 goto Error;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007303 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, writer.pos, x);
7304 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007305 ++s;
7306 }
7307 break;
7308 }
7309 else if (outkind == PyUnicode_2BYTE_KIND) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007310 while (s < e) {
7311 unsigned char ch = *s;
7312 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7313 if (x == 0xFFFE)
7314 goto Error;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007315 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, writer.pos, x);
7316 writer.pos++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007317 ++s;
7318 }
7319 break;
7320 }
7321 }
7322 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007323
Benjamin Peterson29060642009-01-31 22:14:21 +00007324 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007325 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007326 else
7327 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007328Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007329 if (x == 0xfffe)
7330 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 startinpos = s-starts;
7333 endinpos = startinpos+1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007334 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 errors, &errorHandler,
7336 "charmap", "character maps to <undefined>",
7337 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007338 &writer)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 goto onError;
7340 }
7341 continue;
7342 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007343
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007344 if (_PyUnicodeWriter_Prepare(&writer, 1, x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007345 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007346 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, x);
7347 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007349 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007350 }
7351 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007352 while (s < e) {
7353 unsigned char ch = *s;
7354 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007355
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7357 w = PyLong_FromLong((long)ch);
7358 if (w == NULL)
7359 goto onError;
7360 x = PyObject_GetItem(mapping, w);
7361 Py_DECREF(w);
7362 if (x == NULL) {
7363 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7364 /* No mapping found means: mapping is undefined. */
7365 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007366 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007367 } else
7368 goto onError;
7369 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007370
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007372 if (x == Py_None)
7373 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007374 if (PyLong_Check(x)) {
7375 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007376 if (value == 0xFFFE)
7377 goto Undefined;
Antoine Pitroua1f76552012-09-23 20:00:04 +02007378 if (value < 0 || value > MAX_UNICODE) {
7379 PyErr_Format(PyExc_TypeError,
7380 "character mapping must be in range(0x%lx)",
7381 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 Py_DECREF(x);
7383 goto onError;
7384 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007385
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007386 if (_PyUnicodeWriter_Prepare(&writer, 1, value) == -1) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007387 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007388 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007389 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007390 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, value);
7391 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007392 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007393 else if (PyUnicode_Check(x)) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007394 if (PyUnicode_READY(x) == -1) {
7395 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007396 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007397 }
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007398 if (PyUnicode_GET_LENGTH(x) == 1) {
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007399 Py_UCS4 value = PyUnicode_READ_CHAR(x, 0);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007400 if (value == 0xFFFE)
7401 goto Undefined;
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007402 if (_PyUnicodeWriter_Prepare(&writer, 1, value) == -1) {
7403 Py_DECREF(x);
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007404 goto onError;
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007405 }
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007406 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, value);
7407 writer.pos++;
7408 }
7409 else {
7410 writer.overallocate = 1;
Serhiy Storchaka2aee6a62013-01-29 12:16:57 +02007411 if (_PyUnicodeWriter_WriteStr(&writer, x) == -1) {
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007412 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007413 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007414 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007415 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007416 }
7417 else {
7418 /* wrong return value */
7419 PyErr_SetString(PyExc_TypeError,
7420 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007421 Py_DECREF(x);
7422 goto onError;
7423 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007424 Py_DECREF(x);
7425 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007426 continue;
7427Undefined:
7428 /* undefined mapping */
7429 Py_XDECREF(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007430 startinpos = s-starts;
7431 endinpos = startinpos+1;
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007432 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007433 errors, &errorHandler,
7434 "charmap", "character maps to <undefined>",
7435 &starts, &e, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka55e2cb42013-01-15 15:30:04 +02007436 &writer)) {
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007437 goto onError;
7438 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007439 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007440 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007441 Py_XDECREF(errorHandler);
7442 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007443 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007444
Benjamin Peterson29060642009-01-31 22:14:21 +00007445 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007446 Py_XDECREF(errorHandler);
7447 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007448 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449 return NULL;
7450}
7451
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007452/* Charmap encoding: the lookup table */
7453
Alexander Belopolsky40018472011-02-26 01:02:56 +00007454struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 PyObject_HEAD
7456 unsigned char level1[32];
7457 int count2, count3;
7458 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007459};
7460
7461static PyObject*
7462encoding_map_size(PyObject *obj, PyObject* args)
7463{
7464 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007465 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007466 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007467}
7468
7469static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007470 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 PyDoc_STR("Return the size (in bytes) of this object") },
7472 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007473};
7474
7475static void
7476encoding_map_dealloc(PyObject* o)
7477{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007478 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007479}
7480
7481static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007482 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007483 "EncodingMap", /*tp_name*/
7484 sizeof(struct encoding_map), /*tp_basicsize*/
7485 0, /*tp_itemsize*/
7486 /* methods */
7487 encoding_map_dealloc, /*tp_dealloc*/
7488 0, /*tp_print*/
7489 0, /*tp_getattr*/
7490 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007491 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007492 0, /*tp_repr*/
7493 0, /*tp_as_number*/
7494 0, /*tp_as_sequence*/
7495 0, /*tp_as_mapping*/
7496 0, /*tp_hash*/
7497 0, /*tp_call*/
7498 0, /*tp_str*/
7499 0, /*tp_getattro*/
7500 0, /*tp_setattro*/
7501 0, /*tp_as_buffer*/
7502 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7503 0, /*tp_doc*/
7504 0, /*tp_traverse*/
7505 0, /*tp_clear*/
7506 0, /*tp_richcompare*/
7507 0, /*tp_weaklistoffset*/
7508 0, /*tp_iter*/
7509 0, /*tp_iternext*/
7510 encoding_map_methods, /*tp_methods*/
7511 0, /*tp_members*/
7512 0, /*tp_getset*/
7513 0, /*tp_base*/
7514 0, /*tp_dict*/
7515 0, /*tp_descr_get*/
7516 0, /*tp_descr_set*/
7517 0, /*tp_dictoffset*/
7518 0, /*tp_init*/
7519 0, /*tp_alloc*/
7520 0, /*tp_new*/
7521 0, /*tp_free*/
7522 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007523};
7524
7525PyObject*
7526PyUnicode_BuildEncodingMap(PyObject* string)
7527{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007528 PyObject *result;
7529 struct encoding_map *mresult;
7530 int i;
7531 int need_dict = 0;
7532 unsigned char level1[32];
7533 unsigned char level2[512];
7534 unsigned char *mlevel1, *mlevel2, *mlevel3;
7535 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007536 int kind;
7537 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007538 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007539 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007540
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007541 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007542 PyErr_BadArgument();
7543 return NULL;
7544 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007545 kind = PyUnicode_KIND(string);
7546 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007547 length = PyUnicode_GET_LENGTH(string);
7548 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007549 memset(level1, 0xFF, sizeof level1);
7550 memset(level2, 0xFF, sizeof level2);
7551
7552 /* If there isn't a one-to-one mapping of NULL to \0,
7553 or if there are non-BMP characters, we need to use
7554 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007555 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007556 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007557 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007558 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007559 ch = PyUnicode_READ(kind, data, i);
7560 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007561 need_dict = 1;
7562 break;
7563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007564 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007565 /* unmapped character */
7566 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007567 l1 = ch >> 11;
7568 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007569 if (level1[l1] == 0xFF)
7570 level1[l1] = count2++;
7571 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007572 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007573 }
7574
7575 if (count2 >= 0xFF || count3 >= 0xFF)
7576 need_dict = 1;
7577
7578 if (need_dict) {
7579 PyObject *result = PyDict_New();
7580 PyObject *key, *value;
7581 if (!result)
7582 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007583 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007584 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007585 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007586 if (!key || !value)
7587 goto failed1;
7588 if (PyDict_SetItem(result, key, value) == -1)
7589 goto failed1;
7590 Py_DECREF(key);
7591 Py_DECREF(value);
7592 }
7593 return result;
7594 failed1:
7595 Py_XDECREF(key);
7596 Py_XDECREF(value);
7597 Py_DECREF(result);
7598 return NULL;
7599 }
7600
7601 /* Create a three-level trie */
7602 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7603 16*count2 + 128*count3 - 1);
7604 if (!result)
7605 return PyErr_NoMemory();
7606 PyObject_Init(result, &EncodingMapType);
7607 mresult = (struct encoding_map*)result;
7608 mresult->count2 = count2;
7609 mresult->count3 = count3;
7610 mlevel1 = mresult->level1;
7611 mlevel2 = mresult->level23;
7612 mlevel3 = mresult->level23 + 16*count2;
7613 memcpy(mlevel1, level1, 32);
7614 memset(mlevel2, 0xFF, 16*count2);
7615 memset(mlevel3, 0, 128*count3);
7616 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007617 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007618 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007619 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7620 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007621 /* unmapped character */
7622 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007623 o1 = ch>>11;
7624 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007625 i2 = 16*mlevel1[o1] + o2;
7626 if (mlevel2[i2] == 0xFF)
7627 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007628 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007629 i3 = 128*mlevel2[i2] + o3;
7630 mlevel3[i3] = i;
7631 }
7632 return result;
7633}
7634
7635static int
Victor Stinner22168992011-11-20 17:09:18 +01007636encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007637{
7638 struct encoding_map *map = (struct encoding_map*)mapping;
7639 int l1 = c>>11;
7640 int l2 = (c>>7) & 0xF;
7641 int l3 = c & 0x7F;
7642 int i;
7643
Victor Stinner22168992011-11-20 17:09:18 +01007644 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007646 if (c == 0)
7647 return 0;
7648 /* level 1*/
7649 i = map->level1[l1];
7650 if (i == 0xFF) {
7651 return -1;
7652 }
7653 /* level 2*/
7654 i = map->level23[16*i+l2];
7655 if (i == 0xFF) {
7656 return -1;
7657 }
7658 /* level 3 */
7659 i = map->level23[16*map->count2 + 128*i + l3];
7660 if (i == 0) {
7661 return -1;
7662 }
7663 return i;
7664}
7665
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007666/* Lookup the character ch in the mapping. If the character
7667 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007668 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007669static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007670charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007671{
Christian Heimes217cfd12007-12-02 14:31:20 +00007672 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007673 PyObject *x;
7674
7675 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007677 x = PyObject_GetItem(mapping, w);
7678 Py_DECREF(w);
7679 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7681 /* No mapping found means: mapping is undefined. */
7682 PyErr_Clear();
7683 x = Py_None;
7684 Py_INCREF(x);
7685 return x;
7686 } else
7687 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007688 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007689 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007690 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007691 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007692 long value = PyLong_AS_LONG(x);
7693 if (value < 0 || value > 255) {
7694 PyErr_SetString(PyExc_TypeError,
7695 "character mapping must be in range(256)");
7696 Py_DECREF(x);
7697 return NULL;
7698 }
7699 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007700 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007701 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007702 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007704 /* wrong return value */
7705 PyErr_Format(PyExc_TypeError,
7706 "character mapping must return integer, bytes or None, not %.400s",
7707 x->ob_type->tp_name);
7708 Py_DECREF(x);
7709 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710 }
7711}
7712
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007713static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007714charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007715{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007716 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7717 /* exponentially overallocate to minimize reallocations */
7718 if (requiredsize < 2*outsize)
7719 requiredsize = 2*outsize;
7720 if (_PyBytes_Resize(outobj, requiredsize))
7721 return -1;
7722 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007723}
7724
/* Outcome of encoding one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007728/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007729 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007730 space is available. Return a new reference to the object that
7731 was put in the output buffer, or Py_None, if the mapping was undefined
7732 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007733 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007734static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007735charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007736 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007737{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007738 PyObject *rep;
7739 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007740 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007741
Christian Heimes90aa7642007-12-19 02:45:37 +00007742 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007743 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007744 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007745 if (res == -1)
7746 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007747 if (outsize<requiredsize)
7748 if (charmapencode_resize(outobj, outpos, requiredsize))
7749 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007750 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007751 outstart[(*outpos)++] = (char)res;
7752 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007753 }
7754
7755 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007756 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007757 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007758 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007759 Py_DECREF(rep);
7760 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007761 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 if (PyLong_Check(rep)) {
7763 Py_ssize_t requiredsize = *outpos+1;
7764 if (outsize<requiredsize)
7765 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7766 Py_DECREF(rep);
7767 return enc_EXCEPTION;
7768 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007769 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007771 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007772 else {
7773 const char *repchars = PyBytes_AS_STRING(rep);
7774 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7775 Py_ssize_t requiredsize = *outpos+repsize;
7776 if (outsize<requiredsize)
7777 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7778 Py_DECREF(rep);
7779 return enc_EXCEPTION;
7780 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007781 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 memcpy(outstart + *outpos, repchars, repsize);
7783 *outpos += repsize;
7784 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007785 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007786 Py_DECREF(rep);
7787 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007788}
7789
7790/* handle an error in PyUnicode_EncodeCharmap
7791 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007792static int
7793charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007794 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007795 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007796 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007797 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007798{
7799 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007800 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007801 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007802 enum PyUnicode_Kind kind;
7803 void *data;
7804 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007805 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007806 Py_ssize_t collstartpos = *inpos;
7807 Py_ssize_t collendpos = *inpos+1;
7808 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007809 char *encoding = "charmap";
7810 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007811 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007812 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007813 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007814
Benjamin Petersonbac79492012-01-14 13:34:47 -05007815 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007816 return -1;
7817 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007818 /* find all unencodable characters */
7819 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007820 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007821 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007822 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007823 val = encoding_map_lookup(ch, mapping);
7824 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007825 break;
7826 ++collendpos;
7827 continue;
7828 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007829
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007830 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7831 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 if (rep==NULL)
7833 return -1;
7834 else if (rep!=Py_None) {
7835 Py_DECREF(rep);
7836 break;
7837 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007838 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007840 }
7841 /* cache callback name lookup
7842 * (if not done yet, i.e. it's the first error) */
7843 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 if ((errors==NULL) || (!strcmp(errors, "strict")))
7845 *known_errorHandler = 1;
7846 else if (!strcmp(errors, "replace"))
7847 *known_errorHandler = 2;
7848 else if (!strcmp(errors, "ignore"))
7849 *known_errorHandler = 3;
7850 else if (!strcmp(errors, "xmlcharrefreplace"))
7851 *known_errorHandler = 4;
7852 else
7853 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007854 }
7855 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007856 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007857 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007858 return -1;
7859 case 2: /* replace */
7860 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007861 x = charmapencode_output('?', mapping, res, respos);
7862 if (x==enc_EXCEPTION) {
7863 return -1;
7864 }
7865 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007866 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 return -1;
7868 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007869 }
7870 /* fall through */
7871 case 3: /* ignore */
7872 *inpos = collendpos;
7873 break;
7874 case 4: /* xmlcharrefreplace */
7875 /* generate replacement (temporarily (mis)uses p) */
7876 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007877 char buffer[2+29+1+1];
7878 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007879 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00007880 for (cp = buffer; *cp; ++cp) {
7881 x = charmapencode_output(*cp, mapping, res, respos);
7882 if (x==enc_EXCEPTION)
7883 return -1;
7884 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007885 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 return -1;
7887 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007888 }
7889 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007890 *inpos = collendpos;
7891 break;
7892 default:
7893 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007894 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007896 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007898 if (PyBytes_Check(repunicode)) {
7899 /* Directly copy bytes result to output. */
7900 Py_ssize_t outsize = PyBytes_Size(*res);
7901 Py_ssize_t requiredsize;
7902 repsize = PyBytes_Size(repunicode);
7903 requiredsize = *respos + repsize;
7904 if (requiredsize > outsize)
7905 /* Make room for all additional bytes. */
7906 if (charmapencode_resize(res, respos, requiredsize)) {
7907 Py_DECREF(repunicode);
7908 return -1;
7909 }
7910 memcpy(PyBytes_AsString(*res) + *respos,
7911 PyBytes_AsString(repunicode), repsize);
7912 *respos += repsize;
7913 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007914 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007915 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007916 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007917 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05007918 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007919 Py_DECREF(repunicode);
7920 return -1;
7921 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01007922 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007923 data = PyUnicode_DATA(repunicode);
7924 kind = PyUnicode_KIND(repunicode);
7925 for (index = 0; index < repsize; index++) {
7926 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
7927 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007929 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 return -1;
7931 }
7932 else if (x==enc_FAILED) {
7933 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007934 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007935 return -1;
7936 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007937 }
7938 *inpos = newpos;
7939 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007940 }
7941 return 0;
7942}
7943
Alexander Belopolsky40018472011-02-26 01:02:56 +00007944PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007945_PyUnicode_EncodeCharmap(PyObject *unicode,
7946 PyObject *mapping,
7947 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007948{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007949 /* output object */
7950 PyObject *res = NULL;
7951 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007952 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007953 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007954 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007955 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007956 PyObject *errorHandler = NULL;
7957 PyObject *exc = NULL;
7958 /* the following variable is used for caching string comparisons
7959 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7960 * 3=ignore, 4=xmlcharrefreplace */
7961 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007962
Benjamin Petersonbac79492012-01-14 13:34:47 -05007963 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007964 return NULL;
7965 size = PyUnicode_GET_LENGTH(unicode);
7966
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967 /* Default to Latin-1 */
7968 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007969 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007970
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007971 /* allocate enough for a simple encoding without
7972 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007973 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007974 if (res == NULL)
7975 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007976 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007979 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007980 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007982 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 if (x==enc_EXCEPTION) /* error */
7984 goto onError;
7985 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007986 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 &exc,
7988 &known_errorHandler, &errorHandler, errors,
7989 &res, &respos)) {
7990 goto onError;
7991 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007992 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 else
7994 /* done with this character => adjust input position */
7995 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007997
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007998 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007999 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008000 if (_PyBytes_Resize(&res, respos) < 0)
8001 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008002
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008003 Py_XDECREF(exc);
8004 Py_XDECREF(errorHandler);
8005 return res;
8006
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008008 Py_XDECREF(res);
8009 Py_XDECREF(exc);
8010 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011 return NULL;
8012}
8013
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008014/* Deprecated */
8015PyObject *
8016PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8017 Py_ssize_t size,
8018 PyObject *mapping,
8019 const char *errors)
8020{
8021 PyObject *result;
8022 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8023 if (unicode == NULL)
8024 return NULL;
8025 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8026 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008027 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008028}
8029
Alexander Belopolsky40018472011-02-26 01:02:56 +00008030PyObject *
8031PyUnicode_AsCharmapString(PyObject *unicode,
8032 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033{
8034 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 PyErr_BadArgument();
8036 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008037 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008038 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008039}
8040
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008041/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008042static void
8043make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008044 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008045 Py_ssize_t startpos, Py_ssize_t endpos,
8046 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008047{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008048 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008049 *exceptionObject = _PyUnicodeTranslateError_Create(
8050 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051 }
8052 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8054 goto onError;
8055 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8056 goto onError;
8057 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8058 goto onError;
8059 return;
8060 onError:
8061 Py_DECREF(*exceptionObject);
8062 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063 }
8064}
8065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008066/* error handling callback helper:
8067 build arguments, call the callback and check the arguments,
8068 put the result into newpos and return the replacement string, which
8069 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008070static PyObject *
8071unicode_translate_call_errorhandler(const char *errors,
8072 PyObject **errorHandler,
8073 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008074 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008075 Py_ssize_t startpos, Py_ssize_t endpos,
8076 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008077{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008078 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008079
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008080 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008081 PyObject *restuple;
8082 PyObject *resunicode;
8083
8084 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008086 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008087 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088 }
8089
8090 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008091 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008092 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008094
8095 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008097 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008099 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008100 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008101 Py_DECREF(restuple);
8102 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008103 }
8104 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 &resunicode, &i_newpos)) {
8106 Py_DECREF(restuple);
8107 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008108 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008109 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008110 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008111 else
8112 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008113 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8115 Py_DECREF(restuple);
8116 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008117 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008118 Py_INCREF(resunicode);
8119 Py_DECREF(restuple);
8120 return resunicode;
8121}
8122
8123/* Lookup the character ch in the mapping and put the result in result,
8124 which must be decrefed by the caller.
8125 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008126static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008127charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008128{
Christian Heimes217cfd12007-12-02 14:31:20 +00008129 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008130 PyObject *x;
8131
8132 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008134 x = PyObject_GetItem(mapping, w);
8135 Py_DECREF(w);
8136 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8138 /* No mapping found means: use 1:1 mapping. */
8139 PyErr_Clear();
8140 *result = NULL;
8141 return 0;
8142 } else
8143 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008144 }
8145 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 *result = x;
8147 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008148 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008149 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 long value = PyLong_AS_LONG(x);
8151 long max = PyUnicode_GetMax();
8152 if (value < 0 || value > max) {
8153 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008154 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 Py_DECREF(x);
8156 return -1;
8157 }
8158 *result = x;
8159 return 0;
8160 }
8161 else if (PyUnicode_Check(x)) {
8162 *result = x;
8163 return 0;
8164 }
8165 else {
8166 /* wrong return value */
8167 PyErr_SetString(PyExc_TypeError,
8168 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008169 Py_DECREF(x);
8170 return -1;
8171 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008172}
8173/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008174 if not reallocate and adjust various state variables.
8175 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008176static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008177charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008179{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008181 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008182 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 /* exponentially overallocate to minimize reallocations */
8184 if (requiredsize < 2 * oldsize)
8185 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008186 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8187 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008189 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008190 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008191 }
8192 return 0;
8193}
8194/* lookup the character, put the result in the output string and adjust
8195 various state variables. Return a new reference to the object that
8196 was put in the output buffer in *result, or Py_None, if the mapping was
8197 undefined (in which case no character was written).
8198 The called must decref result.
8199 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008200static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008201charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8202 PyObject *mapping, Py_UCS4 **output,
8203 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008204 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008205{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008206 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8207 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008208 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008209 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008211 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008212 }
8213 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008214 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008215 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008217 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008218 }
8219 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008220 Py_ssize_t repsize;
8221 if (PyUnicode_READY(*res) == -1)
8222 return -1;
8223 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 if (repsize==1) {
8225 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008226 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 }
8228 else if (repsize!=0) {
8229 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008230 Py_ssize_t requiredsize = *opos +
8231 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008233 Py_ssize_t i;
8234 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008236 for(i = 0; i < repsize; i++)
8237 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008239 }
8240 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008242 return 0;
8243}
8244
Alexander Belopolsky40018472011-02-26 01:02:56 +00008245PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008246_PyUnicode_TranslateCharmap(PyObject *input,
8247 PyObject *mapping,
8248 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008250 /* input object */
8251 char *idata;
8252 Py_ssize_t size, i;
8253 int kind;
8254 /* output buffer */
8255 Py_UCS4 *output = NULL;
8256 Py_ssize_t osize;
8257 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008259 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 char *reason = "character maps to <undefined>";
8261 PyObject *errorHandler = NULL;
8262 PyObject *exc = NULL;
8263 /* the following variable is used for caching string comparisons
8264 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8265 * 3=ignore, 4=xmlcharrefreplace */
8266 int known_errorHandler = -1;
8267
Guido van Rossumd57fd912000-03-10 22:53:23 +00008268 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 PyErr_BadArgument();
8270 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008273 if (PyUnicode_READY(input) == -1)
8274 return NULL;
8275 idata = (char*)PyUnicode_DATA(input);
8276 kind = PyUnicode_KIND(input);
8277 size = PyUnicode_GET_LENGTH(input);
8278 i = 0;
8279
8280 if (size == 0) {
8281 Py_INCREF(input);
8282 return input;
8283 }
8284
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008285 /* allocate enough for a simple 1:1 translation without
8286 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008287 osize = size;
8288 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8289 opos = 0;
8290 if (output == NULL) {
8291 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008293 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008295 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 /* try to encode it */
8297 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008298 if (charmaptranslate_output(input, i, mapping,
8299 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 Py_XDECREF(x);
8301 goto onError;
8302 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008303 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008305 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008306 else { /* untranslatable character */
8307 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8308 Py_ssize_t repsize;
8309 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008310 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008312 Py_ssize_t collstart = i;
8313 Py_ssize_t collend = i+1;
8314 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008317 while (collend < size) {
8318 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 goto onError;
8320 Py_XDECREF(x);
8321 if (x!=Py_None)
8322 break;
8323 ++collend;
8324 }
8325 /* cache callback name lookup
8326 * (if not done yet, i.e. it's the first error) */
8327 if (known_errorHandler==-1) {
8328 if ((errors==NULL) || (!strcmp(errors, "strict")))
8329 known_errorHandler = 1;
8330 else if (!strcmp(errors, "replace"))
8331 known_errorHandler = 2;
8332 else if (!strcmp(errors, "ignore"))
8333 known_errorHandler = 3;
8334 else if (!strcmp(errors, "xmlcharrefreplace"))
8335 known_errorHandler = 4;
8336 else
8337 known_errorHandler = 0;
8338 }
8339 switch (known_errorHandler) {
8340 case 1: /* strict */
Victor Stinner6fa62752012-10-23 02:51:50 +02008341 make_translate_exception(&exc,
8342 input, collstart, collend, reason);
8343 if (exc != NULL)
8344 PyCodec_StrictErrors(exc);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008345 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 case 2: /* replace */
8347 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008348 for (coll = collstart; coll<collend; coll++)
8349 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 /* fall through */
8351 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008352 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 break;
8354 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008355 /* generate replacement (temporarily (mis)uses i) */
8356 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 char buffer[2+29+1+1];
8358 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008359 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8360 if (charmaptranslate_makespace(&output, &osize,
8361 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 goto onError;
8363 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008364 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008366 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 break;
8368 default:
8369 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008370 reason, input, &exc,
8371 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008372 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008374 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008375 Py_DECREF(repunicode);
8376 goto onError;
8377 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008379 repsize = PyUnicode_GET_LENGTH(repunicode);
8380 if (charmaptranslate_makespace(&output, &osize,
8381 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 Py_DECREF(repunicode);
8383 goto onError;
8384 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008385 for (uni2 = 0; repsize-->0; ++uni2)
8386 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8387 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008389 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008390 }
8391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008392 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8393 if (!res)
8394 goto onError;
8395 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396 Py_XDECREF(exc);
8397 Py_XDECREF(errorHandler);
8398 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008399
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008402 Py_XDECREF(exc);
8403 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404 return NULL;
8405}
8406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008407/* Deprecated. Use PyUnicode_Translate instead. */
8408PyObject *
8409PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8410 Py_ssize_t size,
8411 PyObject *mapping,
8412 const char *errors)
8413{
Christian Heimes5f520f42012-09-11 14:03:25 +02008414 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8416 if (!unicode)
8417 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008418 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8419 Py_DECREF(unicode);
8420 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008421}
8422
Alexander Belopolsky40018472011-02-26 01:02:56 +00008423PyObject *
8424PyUnicode_Translate(PyObject *str,
8425 PyObject *mapping,
8426 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427{
8428 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008429
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 str = PyUnicode_FromObject(str);
8431 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008432 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434 Py_DECREF(str);
8435 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008436}
Tim Petersced69f82003-09-16 20:30:58 +00008437
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008438static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008439fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008440{
8441 /* No need to call PyUnicode_READY(self) because this function is only
8442 called as a callback from fixup() which does it already. */
8443 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8444 const int kind = PyUnicode_KIND(self);
8445 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008446 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008447 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008448 Py_ssize_t i;
8449
8450 for (i = 0; i < len; ++i) {
8451 ch = PyUnicode_READ(kind, data, i);
8452 fixed = 0;
8453 if (ch > 127) {
8454 if (Py_UNICODE_ISSPACE(ch))
8455 fixed = ' ';
8456 else {
8457 const int decimal = Py_UNICODE_TODECIMAL(ch);
8458 if (decimal >= 0)
8459 fixed = '0' + decimal;
8460 }
8461 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008462 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008463 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008464 PyUnicode_WRITE(kind, data, i, fixed);
8465 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008466 else
8467 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008469 }
8470
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008471 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472}
8473
8474PyObject *
8475_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8476{
8477 if (!PyUnicode_Check(unicode)) {
8478 PyErr_BadInternalCall();
8479 return NULL;
8480 }
8481 if (PyUnicode_READY(unicode) == -1)
8482 return NULL;
8483 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8484 /* If the string is already ASCII, just return the same string */
8485 Py_INCREF(unicode);
8486 return unicode;
8487 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008488 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489}
8490
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008491PyObject *
8492PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8493 Py_ssize_t length)
8494{
Victor Stinnerf0124502011-11-21 23:12:56 +01008495 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008496 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008497 Py_UCS4 maxchar;
8498 enum PyUnicode_Kind kind;
8499 void *data;
8500
Victor Stinner99d7ad02012-02-22 13:37:39 +01008501 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008502 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008503 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008504 if (ch > 127) {
8505 int decimal = Py_UNICODE_TODECIMAL(ch);
8506 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008507 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008508 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008509 }
8510 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008511
8512 /* Copy to a new string */
8513 decimal = PyUnicode_New(length, maxchar);
8514 if (decimal == NULL)
8515 return decimal;
8516 kind = PyUnicode_KIND(decimal);
8517 data = PyUnicode_DATA(decimal);
8518 /* Iterate over code points */
8519 for (i = 0; i < length; i++) {
8520 Py_UNICODE ch = s[i];
8521 if (ch > 127) {
8522 int decimal = Py_UNICODE_TODECIMAL(ch);
8523 if (decimal >= 0)
8524 ch = '0' + decimal;
8525 }
8526 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008528 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008529}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008530/* --- Decimal Encoder ---------------------------------------------------- */
8531
Alexander Belopolsky40018472011-02-26 01:02:56 +00008532int
8533PyUnicode_EncodeDecimal(Py_UNICODE *s,
8534 Py_ssize_t length,
8535 char *output,
8536 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008537{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008538 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008539 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008540 enum PyUnicode_Kind kind;
8541 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008542
8543 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 PyErr_BadArgument();
8545 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008546 }
8547
Victor Stinner42bf7752011-11-21 22:52:58 +01008548 unicode = PyUnicode_FromUnicode(s, length);
8549 if (unicode == NULL)
8550 return -1;
8551
Benjamin Petersonbac79492012-01-14 13:34:47 -05008552 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008553 Py_DECREF(unicode);
8554 return -1;
8555 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008556 kind = PyUnicode_KIND(unicode);
8557 data = PyUnicode_DATA(unicode);
8558
Victor Stinnerb84d7232011-11-22 01:50:07 +01008559 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008560 PyObject *exc;
8561 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008563 Py_ssize_t startpos;
8564
8565 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008566
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008568 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008569 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008571 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 decimal = Py_UNICODE_TODECIMAL(ch);
8573 if (decimal >= 0) {
8574 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008575 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 continue;
8577 }
8578 if (0 < ch && ch < 256) {
8579 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008580 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 continue;
8582 }
Victor Stinner6345be92011-11-25 20:09:01 +01008583
Victor Stinner42bf7752011-11-21 22:52:58 +01008584 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008585 exc = NULL;
8586 raise_encode_exception(&exc, "decimal", unicode,
8587 startpos, startpos+1,
8588 "invalid decimal Unicode string");
8589 Py_XDECREF(exc);
8590 Py_DECREF(unicode);
8591 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008592 }
8593 /* 0-terminate the output string */
8594 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008595 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008596 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008597}
8598
Guido van Rossumd57fd912000-03-10 22:53:23 +00008599/* --- Helpers ------------------------------------------------------------ */
8600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008602any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 Py_ssize_t start,
8604 Py_ssize_t end)
8605{
8606 int kind1, kind2, kind;
8607 void *buf1, *buf2;
8608 Py_ssize_t len1, len2, result;
8609
8610 kind1 = PyUnicode_KIND(s1);
8611 kind2 = PyUnicode_KIND(s2);
8612 kind = kind1 > kind2 ? kind1 : kind2;
8613 buf1 = PyUnicode_DATA(s1);
8614 buf2 = PyUnicode_DATA(s2);
8615 if (kind1 != kind)
8616 buf1 = _PyUnicode_AsKind(s1, kind);
8617 if (!buf1)
8618 return -2;
8619 if (kind2 != kind)
8620 buf2 = _PyUnicode_AsKind(s2, kind);
8621 if (!buf2) {
8622 if (kind1 != kind) PyMem_Free(buf1);
8623 return -2;
8624 }
8625 len1 = PyUnicode_GET_LENGTH(s1);
8626 len2 = PyUnicode_GET_LENGTH(s2);
8627
Victor Stinner794d5672011-10-10 03:21:36 +02008628 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008629 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008630 case PyUnicode_1BYTE_KIND:
8631 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8632 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8633 else
8634 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8635 break;
8636 case PyUnicode_2BYTE_KIND:
8637 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8638 break;
8639 case PyUnicode_4BYTE_KIND:
8640 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8641 break;
8642 default:
8643 assert(0); result = -2;
8644 }
8645 }
8646 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008647 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008648 case PyUnicode_1BYTE_KIND:
8649 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8650 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8651 else
8652 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8653 break;
8654 case PyUnicode_2BYTE_KIND:
8655 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8656 break;
8657 case PyUnicode_4BYTE_KIND:
8658 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8659 break;
8660 default:
8661 assert(0); result = -2;
8662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 }
8664
8665 if (kind1 != kind)
8666 PyMem_Free(buf1);
8667 if (kind2 != kind)
8668 PyMem_Free(buf2);
8669
8670 return result;
8671}
8672
8673Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008674_PyUnicode_InsertThousandsGrouping(
8675 PyObject *unicode, Py_ssize_t index,
8676 Py_ssize_t n_buffer,
8677 void *digits, Py_ssize_t n_digits,
8678 Py_ssize_t min_width,
8679 const char *grouping, PyObject *thousands_sep,
8680 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681{
Victor Stinner41a863c2012-02-24 00:37:51 +01008682 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008683 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008684 Py_ssize_t thousands_sep_len;
8685 Py_ssize_t len;
8686
8687 if (unicode != NULL) {
8688 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008689 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008690 }
8691 else {
8692 kind = PyUnicode_1BYTE_KIND;
8693 data = NULL;
8694 }
8695 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8696 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8697 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8698 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008699 if (thousands_sep_kind < kind) {
8700 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8701 if (!thousands_sep_data)
8702 return -1;
8703 }
8704 else {
8705 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8706 if (!data)
8707 return -1;
8708 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008709 }
8710
Benjamin Petersonead6b532011-12-20 17:23:42 -06008711 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008713 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008714 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008715 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008716 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008717 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008718 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008719 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008720 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008721 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008722 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008723 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008725 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008726 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008727 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008728 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008729 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008730 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008731 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008732 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008733 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008734 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008735 break;
8736 default:
8737 assert(0);
8738 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008740 if (unicode != NULL && thousands_sep_kind != kind) {
8741 if (thousands_sep_kind < kind)
8742 PyMem_Free(thousands_sep_data);
8743 else
8744 PyMem_Free(data);
8745 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008746 if (unicode == NULL) {
8747 *maxchar = 127;
8748 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008749 *maxchar = MAX_MAXCHAR(*maxchar,
8750 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008751 }
8752 }
8753 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008754}
8755
8756
/* Helper macro to fix up start/end slice values.

   `end` is clamped to `len`; a negative `end` or `start` is taken relative
   to `len` and clamped to 0 if it is still negative afterwards.  `start`
   is never clamped to `len`.

   `start` and `end` must be modifiable lvalues and `len` may be evaluated
   more than once.  The body is wrapped in do { } while (0) so that the
   expansion is a single statement and stays correct as the unbraced body
   of an if/else; every use must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008771
Alexander Belopolsky40018472011-02-26 01:02:56 +00008772Py_ssize_t
8773PyUnicode_Count(PyObject *str,
8774 PyObject *substr,
8775 Py_ssize_t start,
8776 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008778 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008779 PyObject* str_obj;
8780 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008781 int kind1, kind2, kind;
8782 void *buf1 = NULL, *buf2 = NULL;
8783 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008784
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008785 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008786 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008787 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008788 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008789 if (!sub_obj) {
8790 Py_DECREF(str_obj);
8791 return -1;
8792 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008793 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008794 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 Py_DECREF(str_obj);
8796 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797 }
Tim Petersced69f82003-09-16 20:30:58 +00008798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008799 kind1 = PyUnicode_KIND(str_obj);
8800 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008801 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008802 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008804 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008805 if (kind2 > kind) {
8806 Py_DECREF(sub_obj);
8807 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008808 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008809 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008810 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008811 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008812 if (!buf2)
8813 goto onError;
8814 len1 = PyUnicode_GET_LENGTH(str_obj);
8815 len2 = PyUnicode_GET_LENGTH(sub_obj);
8816
8817 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008818 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008820 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8821 result = asciilib_count(
8822 ((Py_UCS1*)buf1) + start, end - start,
8823 buf2, len2, PY_SSIZE_T_MAX
8824 );
8825 else
8826 result = ucs1lib_count(
8827 ((Py_UCS1*)buf1) + start, end - start,
8828 buf2, len2, PY_SSIZE_T_MAX
8829 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008830 break;
8831 case PyUnicode_2BYTE_KIND:
8832 result = ucs2lib_count(
8833 ((Py_UCS2*)buf1) + start, end - start,
8834 buf2, len2, PY_SSIZE_T_MAX
8835 );
8836 break;
8837 case PyUnicode_4BYTE_KIND:
8838 result = ucs4lib_count(
8839 ((Py_UCS4*)buf1) + start, end - start,
8840 buf2, len2, PY_SSIZE_T_MAX
8841 );
8842 break;
8843 default:
8844 assert(0); result = 0;
8845 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008846
8847 Py_DECREF(sub_obj);
8848 Py_DECREF(str_obj);
8849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008850 if (kind2 != kind)
8851 PyMem_Free(buf2);
8852
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008854 onError:
8855 Py_DECREF(sub_obj);
8856 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008857 if (kind2 != kind && buf2)
8858 PyMem_Free(buf2);
8859 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008860}
8861
Alexander Belopolsky40018472011-02-26 01:02:56 +00008862Py_ssize_t
8863PyUnicode_Find(PyObject *str,
8864 PyObject *sub,
8865 Py_ssize_t start,
8866 Py_ssize_t end,
8867 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008868{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008869 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008870
Guido van Rossumd57fd912000-03-10 22:53:23 +00008871 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008872 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00008873 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008874 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008875 if (!sub) {
8876 Py_DECREF(str);
8877 return -2;
8878 }
8879 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
8880 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 Py_DECREF(str);
8882 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008883 }
Tim Petersced69f82003-09-16 20:30:58 +00008884
Victor Stinner794d5672011-10-10 03:21:36 +02008885 result = any_find_slice(direction,
8886 str, sub, start, end
8887 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008888
Guido van Rossumd57fd912000-03-10 22:53:23 +00008889 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008890 Py_DECREF(sub);
8891
Guido van Rossumd57fd912000-03-10 22:53:23 +00008892 return result;
8893}
8894
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895Py_ssize_t
8896PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8897 Py_ssize_t start, Py_ssize_t end,
8898 int direction)
8899{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008901 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008902 if (PyUnicode_READY(str) == -1)
8903 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008904 if (start < 0 || end < 0) {
8905 PyErr_SetString(PyExc_IndexError, "string index out of range");
8906 return -2;
8907 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908 if (end > PyUnicode_GET_LENGTH(str))
8909 end = PyUnicode_GET_LENGTH(str);
8910 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008911 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8912 kind, end-start, ch, direction);
8913 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008914 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008915 else
8916 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008917}
8918
Alexander Belopolsky40018472011-02-26 01:02:56 +00008919static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008920tailmatch(PyObject *self,
8921 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008922 Py_ssize_t start,
8923 Py_ssize_t end,
8924 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926 int kind_self;
8927 int kind_sub;
8928 void *data_self;
8929 void *data_sub;
8930 Py_ssize_t offset;
8931 Py_ssize_t i;
8932 Py_ssize_t end_sub;
8933
8934 if (PyUnicode_READY(self) == -1 ||
8935 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01008936 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008937
8938 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008939 return 1;
8940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008941 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8942 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008943 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008944 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 kind_self = PyUnicode_KIND(self);
8947 data_self = PyUnicode_DATA(self);
8948 kind_sub = PyUnicode_KIND(substring);
8949 data_sub = PyUnicode_DATA(substring);
8950 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8951
8952 if (direction > 0)
8953 offset = end;
8954 else
8955 offset = start;
8956
8957 if (PyUnicode_READ(kind_self, data_self, offset) ==
8958 PyUnicode_READ(kind_sub, data_sub, 0) &&
8959 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8960 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8961 /* If both are of the same kind, memcmp is sufficient */
8962 if (kind_self == kind_sub) {
8963 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008964 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 data_sub,
8966 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008967 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008968 }
8969 /* otherwise we have to compare each character by first accesing it */
8970 else {
8971 /* We do not need to compare 0 and len(substring)-1 because
8972 the if statement above ensured already that they are equal
8973 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 for (i = 1; i < end_sub; ++i) {
8975 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8976 PyUnicode_READ(kind_sub, data_sub, i))
8977 return 0;
8978 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008979 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008981 }
8982
8983 return 0;
8984}
8985
Alexander Belopolsky40018472011-02-26 01:02:56 +00008986Py_ssize_t
8987PyUnicode_Tailmatch(PyObject *str,
8988 PyObject *substr,
8989 Py_ssize_t start,
8990 Py_ssize_t end,
8991 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008992{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008993 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008994
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995 str = PyUnicode_FromObject(str);
8996 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008997 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998 substr = PyUnicode_FromObject(substr);
8999 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009000 Py_DECREF(str);
9001 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002 }
Tim Petersced69f82003-09-16 20:30:58 +00009003
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009004 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009005 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006 Py_DECREF(str);
9007 Py_DECREF(substr);
9008 return result;
9009}
9010
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011/* Apply fixfct filter to the Unicode object self and return a
9012 reference to the modified object */
9013
Alexander Belopolsky40018472011-02-26 01:02:56 +00009014static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009015fixup(PyObject *self,
9016 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009017{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009018 PyObject *u;
9019 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009020 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009022 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009024 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009025 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027 /* fix functions return the new maximum character in a string,
9028 if the kind of the resulting unicode object does not change,
9029 everything is fine. Otherwise we need to change the string kind
9030 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009031 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009032
9033 if (maxchar_new == 0) {
9034 /* no changes */;
9035 if (PyUnicode_CheckExact(self)) {
9036 Py_DECREF(u);
9037 Py_INCREF(self);
9038 return self;
9039 }
9040 else
9041 return u;
9042 }
9043
Victor Stinnere6abb482012-05-02 01:15:40 +02009044 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045
Victor Stinnereaab6042011-12-11 22:22:39 +01009046 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009048
9049 /* In case the maximum character changed, we need to
9050 convert the string to the new category. */
9051 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9052 if (v == NULL) {
9053 Py_DECREF(u);
9054 return NULL;
9055 }
9056 if (maxchar_new > maxchar_old) {
9057 /* If the maxchar increased so that the kind changed, not all
9058 characters are representable anymore and we need to fix the
9059 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009060 _PyUnicode_FastCopyCharacters(v, 0,
9061 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009062 maxchar_old = fixfct(v);
9063 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 }
9065 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009066 _PyUnicode_FastCopyCharacters(v, 0,
9067 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009069 Py_DECREF(u);
9070 assert(_PyUnicode_CheckConsistency(v, 1));
9071 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009072}
9073
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009074static PyObject *
9075ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009077 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9078 char *resdata, *data = PyUnicode_DATA(self);
9079 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009080
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009081 res = PyUnicode_New(len, 127);
9082 if (res == NULL)
9083 return NULL;
9084 resdata = PyUnicode_DATA(res);
9085 if (lower)
9086 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009087 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009088 _Py_bytes_upper(resdata, data, len);
9089 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009090}
9091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009093handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009095 Py_ssize_t j;
9096 int final_sigma;
9097 Py_UCS4 c;
9098 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009099
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009100 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9101
9102 where ! is a negation and \p{xxx} is a character with property xxx.
9103 */
9104 for (j = i - 1; j >= 0; j--) {
9105 c = PyUnicode_READ(kind, data, j);
9106 if (!_PyUnicode_IsCaseIgnorable(c))
9107 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009109 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9110 if (final_sigma) {
9111 for (j = i + 1; j < length; j++) {
9112 c = PyUnicode_READ(kind, data, j);
9113 if (!_PyUnicode_IsCaseIgnorable(c))
9114 break;
9115 }
9116 final_sigma = j == length || !_PyUnicode_IsCased(c);
9117 }
9118 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009119}
9120
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009121static int
9122lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9123 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009124{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009125 /* Obscure special case. */
9126 if (c == 0x3A3) {
9127 mapped[0] = handle_capital_sigma(kind, data, length, i);
9128 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009129 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009130 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009131}
9132
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009133static Py_ssize_t
9134do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009135{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009136 Py_ssize_t i, k = 0;
9137 int n_res, j;
9138 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009139
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009140 c = PyUnicode_READ(kind, data, 0);
9141 n_res = _PyUnicode_ToUpperFull(c, mapped);
9142 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009143 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009144 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009146 for (i = 1; i < length; i++) {
9147 c = PyUnicode_READ(kind, data, i);
9148 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9149 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009150 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009151 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009152 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009153 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009154 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155}
9156
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009157static Py_ssize_t
9158do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9159 Py_ssize_t i, k = 0;
9160
9161 for (i = 0; i < length; i++) {
9162 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9163 int n_res, j;
9164 if (Py_UNICODE_ISUPPER(c)) {
9165 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9166 }
9167 else if (Py_UNICODE_ISLOWER(c)) {
9168 n_res = _PyUnicode_ToUpperFull(c, mapped);
9169 }
9170 else {
9171 n_res = 1;
9172 mapped[0] = c;
9173 }
9174 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009175 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009176 res[k++] = mapped[j];
9177 }
9178 }
9179 return k;
9180}
9181
9182static Py_ssize_t
9183do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9184 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009186 Py_ssize_t i, k = 0;
9187
9188 for (i = 0; i < length; i++) {
9189 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9190 int n_res, j;
9191 if (lower)
9192 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9193 else
9194 n_res = _PyUnicode_ToUpperFull(c, mapped);
9195 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009196 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009197 res[k++] = mapped[j];
9198 }
9199 }
9200 return k;
9201}
9202
9203static Py_ssize_t
9204do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9205{
9206 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9207}
9208
9209static Py_ssize_t
9210do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9211{
9212 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9213}
9214
Benjamin Petersone51757f2012-01-12 21:10:29 -05009215static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009216do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9217{
9218 Py_ssize_t i, k = 0;
9219
9220 for (i = 0; i < length; i++) {
9221 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9222 Py_UCS4 mapped[3];
9223 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9224 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009225 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009226 res[k++] = mapped[j];
9227 }
9228 }
9229 return k;
9230}
9231
9232static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009233do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9234{
9235 Py_ssize_t i, k = 0;
9236 int previous_is_cased;
9237
9238 previous_is_cased = 0;
9239 for (i = 0; i < length; i++) {
9240 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9241 Py_UCS4 mapped[3];
9242 int n_res, j;
9243
9244 if (previous_is_cased)
9245 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9246 else
9247 n_res = _PyUnicode_ToTitleFull(c, mapped);
9248
9249 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009250 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009251 res[k++] = mapped[j];
9252 }
9253
9254 previous_is_cased = _PyUnicode_IsCased(c);
9255 }
9256 return k;
9257}
9258
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009259static PyObject *
9260case_operation(PyObject *self,
9261 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9262{
9263 PyObject *res = NULL;
9264 Py_ssize_t length, newlength = 0;
9265 int kind, outkind;
9266 void *data, *outdata;
9267 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9268
Benjamin Petersoneea48462012-01-16 14:28:50 -05009269 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009270
9271 kind = PyUnicode_KIND(self);
9272 data = PyUnicode_DATA(self);
9273 length = PyUnicode_GET_LENGTH(self);
9274 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9275 if (tmp == NULL)
9276 return PyErr_NoMemory();
9277 newlength = perform(kind, data, length, tmp, &maxchar);
9278 res = PyUnicode_New(newlength, maxchar);
9279 if (res == NULL)
9280 goto leave;
9281 tmpend = tmp + newlength;
9282 outdata = PyUnicode_DATA(res);
9283 outkind = PyUnicode_KIND(res);
9284 switch (outkind) {
9285 case PyUnicode_1BYTE_KIND:
9286 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9287 break;
9288 case PyUnicode_2BYTE_KIND:
9289 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9290 break;
9291 case PyUnicode_4BYTE_KIND:
9292 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9293 break;
9294 default:
9295 assert(0);
9296 break;
9297 }
9298 leave:
9299 PyMem_FREE(tmp);
9300 return res;
9301}
9302
Tim Peters8ce9f162004-08-27 01:49:32 +00009303PyObject *
9304PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009307 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009309 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009310 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9311 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009312 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009314 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009316 int use_memcpy;
9317 unsigned char *res_data = NULL, *sep_data = NULL;
9318 PyObject *last_obj;
9319 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009320
Tim Peters05eba1f2004-08-27 21:32:02 +00009321 fseq = PySequence_Fast(seq, "");
9322 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009323 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009324 }
9325
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009326 /* NOTE: the following code can't call back into Python code,
9327 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009328 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009329
Tim Peters05eba1f2004-08-27 21:32:02 +00009330 seqlen = PySequence_Fast_GET_SIZE(fseq);
9331 /* If empty sequence, return u"". */
9332 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009333 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009334 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009335 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009336
Tim Peters05eba1f2004-08-27 21:32:02 +00009337 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009338 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009339 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009340 if (seqlen == 1) {
9341 if (PyUnicode_CheckExact(items[0])) {
9342 res = items[0];
9343 Py_INCREF(res);
9344 Py_DECREF(fseq);
9345 return res;
9346 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009347 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009348 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009349 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009350 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009351 /* Set up sep and seplen */
9352 if (separator == NULL) {
9353 /* fall back to a blank space separator */
9354 sep = PyUnicode_FromOrdinal(' ');
9355 if (!sep)
9356 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009357 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009358 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009359 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009360 else {
9361 if (!PyUnicode_Check(separator)) {
9362 PyErr_Format(PyExc_TypeError,
9363 "separator: expected str instance,"
9364 " %.80s found",
9365 Py_TYPE(separator)->tp_name);
9366 goto onError;
9367 }
9368 if (PyUnicode_READY(separator))
9369 goto onError;
9370 sep = separator;
9371 seplen = PyUnicode_GET_LENGTH(separator);
9372 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9373 /* inc refcount to keep this code path symmetric with the
9374 above case of a blank separator */
9375 Py_INCREF(sep);
9376 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009377 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009378 }
9379
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009380 /* There are at least two things to join, or else we have a subclass
9381 * of str in the sequence.
9382 * Do a pre-pass to figure out the total amount of space we'll
9383 * need (sz), and see whether all argument are strings.
9384 */
9385 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009386#ifdef Py_DEBUG
9387 use_memcpy = 0;
9388#else
9389 use_memcpy = 1;
9390#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009391 for (i = 0; i < seqlen; i++) {
9392 const Py_ssize_t old_sz = sz;
9393 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009394 if (!PyUnicode_Check(item)) {
9395 PyErr_Format(PyExc_TypeError,
9396 "sequence item %zd: expected str instance,"
9397 " %.80s found",
9398 i, Py_TYPE(item)->tp_name);
9399 goto onError;
9400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 if (PyUnicode_READY(item) == -1)
9402 goto onError;
9403 sz += PyUnicode_GET_LENGTH(item);
9404 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009405 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009406 if (i != 0)
9407 sz += seplen;
9408 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9409 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009410 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009411 goto onError;
9412 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009413 if (use_memcpy && last_obj != NULL) {
9414 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9415 use_memcpy = 0;
9416 }
9417 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009418 }
Tim Petersced69f82003-09-16 20:30:58 +00009419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009421 if (res == NULL)
9422 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009423
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009424 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009425#ifdef Py_DEBUG
9426 use_memcpy = 0;
9427#else
9428 if (use_memcpy) {
9429 res_data = PyUnicode_1BYTE_DATA(res);
9430 kind = PyUnicode_KIND(res);
9431 if (seplen != 0)
9432 sep_data = PyUnicode_1BYTE_DATA(sep);
9433 }
9434#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009436 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009437 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009438 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009439 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009440 if (use_memcpy) {
9441 Py_MEMCPY(res_data,
9442 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009443 kind * seplen);
9444 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009445 }
9446 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009447 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009448 res_offset += seplen;
9449 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009450 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009451 itemlen = PyUnicode_GET_LENGTH(item);
9452 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009453 if (use_memcpy) {
9454 Py_MEMCPY(res_data,
9455 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009456 kind * itemlen);
9457 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009458 }
9459 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009460 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009461 res_offset += itemlen;
9462 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009463 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009464 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009465 if (use_memcpy)
9466 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009467 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009468 else
9469 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009470
Tim Peters05eba1f2004-08-27 21:32:02 +00009471 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009473 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475
Benjamin Peterson29060642009-01-31 22:14:21 +00009476 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009477 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009479 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480 return NULL;
9481}
9482
/* Store `value` into `length` consecutive character slots of `data`,
   beginning at character index `start`.  `kind` selects the slot width
   (1, 2 or 4 byte kind); the wchar kind is rejected by an assertion.
   The 1-byte case uses memset(), the wider cases a simple loop.

   All macro arguments are parenthesized at their point of use; note that
   `value` and `length` may still be evaluated more than once, so pass
   side-effect-free expressions. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9506
Victor Stinnerd3f08822012-05-29 12:57:52 +02009507void
9508_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9509 Py_UCS4 fill_char)
9510{
9511 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9512 const void *data = PyUnicode_DATA(unicode);
9513 assert(PyUnicode_IS_READY(unicode));
9514 assert(unicode_modifiable(unicode));
9515 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9516 assert(start >= 0);
9517 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9518 FILL(kind, data, fill_char, start, length);
9519}
9520
Victor Stinner3fe55312012-01-04 00:33:50 +01009521Py_ssize_t
9522PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9523 Py_UCS4 fill_char)
9524{
9525 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009526
9527 if (!PyUnicode_Check(unicode)) {
9528 PyErr_BadInternalCall();
9529 return -1;
9530 }
9531 if (PyUnicode_READY(unicode) == -1)
9532 return -1;
9533 if (unicode_check_modifiable(unicode))
9534 return -1;
9535
Victor Stinnerd3f08822012-05-29 12:57:52 +02009536 if (start < 0) {
9537 PyErr_SetString(PyExc_IndexError, "string index out of range");
9538 return -1;
9539 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009540 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9541 PyErr_SetString(PyExc_ValueError,
9542 "fill character is bigger than "
9543 "the string maximum character");
9544 return -1;
9545 }
9546
9547 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9548 length = Py_MIN(maxlen, length);
9549 if (length <= 0)
9550 return 0;
9551
Victor Stinnerd3f08822012-05-29 12:57:52 +02009552 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009553 return length;
9554}
9555
Victor Stinner9310abb2011-10-05 00:59:23 +02009556static PyObject *
9557pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009558 Py_ssize_t left,
9559 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009561{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 PyObject *u;
9563 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009564 int kind;
9565 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566
9567 if (left < 0)
9568 left = 0;
9569 if (right < 0)
9570 right = 0;
9571
Victor Stinnerc4b49542011-12-11 22:44:26 +01009572 if (left == 0 && right == 0)
9573 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009575 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9576 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009577 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9578 return NULL;
9579 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009580 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009581 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009582 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009583 if (!u)
9584 return NULL;
9585
9586 kind = PyUnicode_KIND(u);
9587 data = PyUnicode_DATA(u);
9588 if (left)
9589 FILL(kind, data, fill, 0, left);
9590 if (right)
9591 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009592 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009593 assert(_PyUnicode_CheckConsistency(u, 1));
9594 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009595}
9596
Alexander Belopolsky40018472011-02-26 01:02:56 +00009597PyObject *
9598PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009600 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009601
9602 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009603 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009604 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009605 if (PyUnicode_READY(string) == -1) {
9606 Py_DECREF(string);
9607 return NULL;
9608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009609
Benjamin Petersonead6b532011-12-20 17:23:42 -06009610 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009612 if (PyUnicode_IS_ASCII(string))
9613 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009614 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009615 PyUnicode_GET_LENGTH(string), keepends);
9616 else
9617 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009618 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009619 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009620 break;
9621 case PyUnicode_2BYTE_KIND:
9622 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009623 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 PyUnicode_GET_LENGTH(string), keepends);
9625 break;
9626 case PyUnicode_4BYTE_KIND:
9627 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009628 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 PyUnicode_GET_LENGTH(string), keepends);
9630 break;
9631 default:
9632 assert(0);
9633 list = 0;
9634 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635 Py_DECREF(string);
9636 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009637}
9638
Alexander Belopolsky40018472011-02-26 01:02:56 +00009639static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009640split(PyObject *self,
9641 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009642 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 int kind1, kind2, kind;
9645 void *buf1, *buf2;
9646 Py_ssize_t len1, len2;
9647 PyObject* out;
9648
Guido van Rossumd57fd912000-03-10 22:53:23 +00009649 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009650 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 if (PyUnicode_READY(self) == -1)
9653 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009656 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009658 if (PyUnicode_IS_ASCII(self))
9659 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009660 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009661 PyUnicode_GET_LENGTH(self), maxcount
9662 );
9663 else
9664 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009665 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009666 PyUnicode_GET_LENGTH(self), maxcount
9667 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668 case PyUnicode_2BYTE_KIND:
9669 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009670 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009671 PyUnicode_GET_LENGTH(self), maxcount
9672 );
9673 case PyUnicode_4BYTE_KIND:
9674 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009675 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009676 PyUnicode_GET_LENGTH(self), maxcount
9677 );
9678 default:
9679 assert(0);
9680 return NULL;
9681 }
9682
9683 if (PyUnicode_READY(substring) == -1)
9684 return NULL;
9685
9686 kind1 = PyUnicode_KIND(self);
9687 kind2 = PyUnicode_KIND(substring);
9688 kind = kind1 > kind2 ? kind1 : kind2;
9689 buf1 = PyUnicode_DATA(self);
9690 buf2 = PyUnicode_DATA(substring);
9691 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009692 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009693 if (!buf1)
9694 return NULL;
9695 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009696 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 if (!buf2) {
9698 if (kind1 != kind) PyMem_Free(buf1);
9699 return NULL;
9700 }
9701 len1 = PyUnicode_GET_LENGTH(self);
9702 len2 = PyUnicode_GET_LENGTH(substring);
9703
Benjamin Petersonead6b532011-12-20 17:23:42 -06009704 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009705 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009706 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9707 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009708 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009709 else
9710 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009711 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009712 break;
9713 case PyUnicode_2BYTE_KIND:
9714 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009715 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 break;
9717 case PyUnicode_4BYTE_KIND:
9718 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009719 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 break;
9721 default:
9722 out = NULL;
9723 }
9724 if (kind1 != kind)
9725 PyMem_Free(buf1);
9726 if (kind2 != kind)
9727 PyMem_Free(buf2);
9728 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009729}
9730
Alexander Belopolsky40018472011-02-26 01:02:56 +00009731static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009732rsplit(PyObject *self,
9733 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009734 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009735{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 int kind1, kind2, kind;
9737 void *buf1, *buf2;
9738 Py_ssize_t len1, len2;
9739 PyObject* out;
9740
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009741 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009742 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009744 if (PyUnicode_READY(self) == -1)
9745 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009747 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009748 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009749 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009750 if (PyUnicode_IS_ASCII(self))
9751 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009752 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009753 PyUnicode_GET_LENGTH(self), maxcount
9754 );
9755 else
9756 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009757 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009758 PyUnicode_GET_LENGTH(self), maxcount
9759 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009760 case PyUnicode_2BYTE_KIND:
9761 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009762 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009763 PyUnicode_GET_LENGTH(self), maxcount
9764 );
9765 case PyUnicode_4BYTE_KIND:
9766 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009767 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 PyUnicode_GET_LENGTH(self), maxcount
9769 );
9770 default:
9771 assert(0);
9772 return NULL;
9773 }
9774
9775 if (PyUnicode_READY(substring) == -1)
9776 return NULL;
9777
9778 kind1 = PyUnicode_KIND(self);
9779 kind2 = PyUnicode_KIND(substring);
9780 kind = kind1 > kind2 ? kind1 : kind2;
9781 buf1 = PyUnicode_DATA(self);
9782 buf2 = PyUnicode_DATA(substring);
9783 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009784 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785 if (!buf1)
9786 return NULL;
9787 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009788 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 if (!buf2) {
9790 if (kind1 != kind) PyMem_Free(buf1);
9791 return NULL;
9792 }
9793 len1 = PyUnicode_GET_LENGTH(self);
9794 len2 = PyUnicode_GET_LENGTH(substring);
9795
Benjamin Petersonead6b532011-12-20 17:23:42 -06009796 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009798 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9799 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009800 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009801 else
9802 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009803 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 break;
9805 case PyUnicode_2BYTE_KIND:
9806 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009807 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808 break;
9809 case PyUnicode_4BYTE_KIND:
9810 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009811 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 break;
9813 default:
9814 out = NULL;
9815 }
9816 if (kind1 != kind)
9817 PyMem_Free(buf1);
9818 if (kind2 != kind)
9819 PyMem_Free(buf2);
9820 return out;
9821}
9822
9823static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009824anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9825 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009826{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009827 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009829 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9830 return asciilib_find(buf1, len1, buf2, len2, offset);
9831 else
9832 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 case PyUnicode_2BYTE_KIND:
9834 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9835 case PyUnicode_4BYTE_KIND:
9836 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9837 }
9838 assert(0);
9839 return -1;
9840}
9841
9842static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009843anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9844 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009846 switch (kind) {
9847 case PyUnicode_1BYTE_KIND:
9848 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9849 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9850 else
9851 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9852 case PyUnicode_2BYTE_KIND:
9853 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9854 case PyUnicode_4BYTE_KIND:
9855 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9856 }
9857 assert(0);
9858 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009859}
9860
Alexander Belopolsky40018472011-02-26 01:02:56 +00009861static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862replace(PyObject *self, PyObject *str1,
9863 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 PyObject *u;
9866 char *sbuf = PyUnicode_DATA(self);
9867 char *buf1 = PyUnicode_DATA(str1);
9868 char *buf2 = PyUnicode_DATA(str2);
9869 int srelease = 0, release1 = 0, release2 = 0;
9870 int skind = PyUnicode_KIND(self);
9871 int kind1 = PyUnicode_KIND(str1);
9872 int kind2 = PyUnicode_KIND(str2);
9873 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9874 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9875 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009876 int mayshrink;
9877 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009878
9879 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009880 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009882 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883
Victor Stinner59de0ee2011-10-07 10:01:28 +02009884 if (str1 == str2)
9885 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009886 if (skind < kind1)
9887 /* substring too wide to be present */
9888 goto nothing;
9889
Victor Stinner49a0a212011-10-12 23:46:10 +02009890 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9891 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9892 /* Replacing str1 with str2 may cause a maxchar reduction in the
9893 result string. */
9894 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +02009895 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009897 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009898 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009899 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009900 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009902 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009903 Py_UCS4 u1, u2;
9904 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +01009905 Py_ssize_t index, pos;
9906 char *src;
9907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +01009909 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
9910 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009911 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009912 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009914 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009915 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +02009916 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +01009918
9919 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
9920 index = 0;
9921 src = sbuf;
9922 while (--maxcount)
9923 {
9924 pos++;
9925 src += pos * PyUnicode_KIND(self);
9926 slen -= pos;
9927 index += pos;
9928 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
9929 if (pos < 0)
9930 break;
9931 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
9932 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009933 }
9934 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 int rkind = skind;
9936 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +01009937 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +02009938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 if (kind1 < rkind) {
9940 /* widen substring */
9941 buf1 = _PyUnicode_AsKind(str1, rkind);
9942 if (!buf1) goto error;
9943 release1 = 1;
9944 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009945 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009946 if (i < 0)
9947 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009948 if (rkind > kind2) {
9949 /* widen replacement */
9950 buf2 = _PyUnicode_AsKind(str2, rkind);
9951 if (!buf2) goto error;
9952 release2 = 1;
9953 }
9954 else if (rkind < kind2) {
9955 /* widen self and buf1 */
9956 rkind = kind2;
9957 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +01009958 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 sbuf = _PyUnicode_AsKind(self, rkind);
9960 if (!sbuf) goto error;
9961 srelease = 1;
9962 buf1 = _PyUnicode_AsKind(str1, rkind);
9963 if (!buf1) goto error;
9964 release1 = 1;
9965 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009966 u = PyUnicode_New(slen, maxchar);
9967 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009969 assert(PyUnicode_KIND(u) == rkind);
9970 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +02009971
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009972 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009973 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009974 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009976 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009977 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009978
9979 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009980 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009981 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009982 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009983 if (i == -1)
9984 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009985 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009987 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009990 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009991 }
9992 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01009994 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 int rkind = skind;
9996 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +02009999 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 buf1 = _PyUnicode_AsKind(str1, rkind);
10001 if (!buf1) goto error;
10002 release1 = 1;
10003 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010004 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010005 if (n == 0)
10006 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010007 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010008 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 buf2 = _PyUnicode_AsKind(str2, rkind);
10010 if (!buf2) goto error;
10011 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010014 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 rkind = kind2;
10016 sbuf = _PyUnicode_AsKind(self, rkind);
10017 if (!sbuf) goto error;
10018 srelease = 1;
10019 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010020 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 buf1 = _PyUnicode_AsKind(str1, rkind);
10022 if (!buf1) goto error;
10023 release1 = 1;
10024 }
10025 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10026 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010027 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 PyErr_SetString(PyExc_OverflowError,
10029 "replace string is too long");
10030 goto error;
10031 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010032 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010033 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010034 _Py_INCREF_UNICODE_EMPTY();
10035 if (!unicode_empty)
10036 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010037 u = unicode_empty;
10038 goto done;
10039 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010040 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 PyErr_SetString(PyExc_OverflowError,
10042 "replace string is too long");
10043 goto error;
10044 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010045 u = PyUnicode_New(new_size, maxchar);
10046 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010048 assert(PyUnicode_KIND(u) == rkind);
10049 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 ires = i = 0;
10051 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010052 while (n-- > 0) {
10053 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010054 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010055 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010056 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010057 if (j == -1)
10058 break;
10059 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010060 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010061 memcpy(res + rkind * ires,
10062 sbuf + rkind * i,
10063 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010065 }
10066 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010068 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010070 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010072 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010074 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010076 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010077 memcpy(res + rkind * ires,
10078 sbuf + rkind * i,
10079 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010080 }
10081 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010082 /* interleave */
10083 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010084 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010086 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010088 if (--n <= 0)
10089 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010090 memcpy(res + rkind * ires,
10091 sbuf + rkind * i,
10092 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 ires++;
10094 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010095 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010096 memcpy(res + rkind * ires,
10097 sbuf + rkind * i,
10098 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010099 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010100 }
10101
10102 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010103 unicode_adjust_maxchar(&u);
10104 if (u == NULL)
10105 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010106 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010107
10108 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 if (srelease)
10110 PyMem_FREE(sbuf);
10111 if (release1)
10112 PyMem_FREE(buf1);
10113 if (release2)
10114 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010115 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010117
Benjamin Peterson29060642009-01-31 22:14:21 +000010118 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010119 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 if (srelease)
10121 PyMem_FREE(sbuf);
10122 if (release1)
10123 PyMem_FREE(buf1);
10124 if (release2)
10125 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010126 return unicode_result_unchanged(self);
10127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 error:
10129 if (srelease && sbuf)
10130 PyMem_FREE(sbuf);
10131 if (release1 && buf1)
10132 PyMem_FREE(buf1);
10133 if (release2 && buf2)
10134 PyMem_FREE(buf2);
10135 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136}
10137
10138/* --- Unicode Object Methods --------------------------------------------- */
10139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010140PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010141 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010142\n\
10143Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010144characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010145
10146static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010147unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010149 if (PyUnicode_READY(self) == -1)
10150 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010151 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010152}
10153
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010154PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010155 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010156\n\
10157Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010158have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010159
10160static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010161unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010162{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010163 if (PyUnicode_READY(self) == -1)
10164 return NULL;
10165 if (PyUnicode_GET_LENGTH(self) == 0)
10166 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010167 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168}
10169
Benjamin Petersond5890c82012-01-14 13:23:30 -050010170PyDoc_STRVAR(casefold__doc__,
10171 "S.casefold() -> str\n\
10172\n\
10173Return a version of S suitable for caseless comparisons.");
10174
10175static PyObject *
10176unicode_casefold(PyObject *self)
10177{
10178 if (PyUnicode_READY(self) == -1)
10179 return NULL;
10180 if (PyUnicode_IS_ASCII(self))
10181 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010182 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010183}
10184
10185
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010186/* Argument converter. Coerces to a single unicode character */
10187
10188static int
10189convert_uc(PyObject *obj, void *addr)
10190{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010192 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010193
Benjamin Peterson14339b62009-01-31 16:36:08 +000010194 uniobj = PyUnicode_FromObject(obj);
10195 if (uniobj == NULL) {
10196 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010197 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010198 return 0;
10199 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010201 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010202 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010203 Py_DECREF(uniobj);
10204 return 0;
10205 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010207 Py_DECREF(uniobj);
10208 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010209}
10210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010211PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010212 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010214Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010215done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216
10217static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010218unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010219{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010220 Py_ssize_t marg, left;
10221 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 Py_UCS4 fillchar = ' ';
10223
Victor Stinnere9a29352011-10-01 02:14:59 +020010224 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226
Benjamin Petersonbac79492012-01-14 13:34:47 -050010227 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228 return NULL;
10229
Victor Stinnerc4b49542011-12-11 22:44:26 +010010230 if (PyUnicode_GET_LENGTH(self) >= width)
10231 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232
Victor Stinnerc4b49542011-12-11 22:44:26 +010010233 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234 left = marg / 2 + (marg & width & 1);
10235
Victor Stinner9310abb2011-10-05 00:59:23 +020010236 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010237}
10238
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239/* This function assumes that str1 and str2 are readied by the caller. */
10240
Marc-André Lemburge5034372000-08-08 08:04:29 +000010241static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010242unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010243{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 int kind1, kind2;
10245 void *data1, *data2;
Victor Stinner770e19e2012-10-04 22:59:45 +020010246 Py_ssize_t len1, len2;
10247 Py_ssize_t i, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010248
Victor Stinner90db9c42012-10-04 21:53:50 +020010249 /* a string is equal to itself */
10250 if (str1 == str2)
10251 return 0;
10252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 kind1 = PyUnicode_KIND(str1);
10254 kind2 = PyUnicode_KIND(str2);
10255 data1 = PyUnicode_DATA(str1);
10256 data2 = PyUnicode_DATA(str2);
10257 len1 = PyUnicode_GET_LENGTH(str1);
10258 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010259 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010260
Victor Stinner770e19e2012-10-04 22:59:45 +020010261 if (kind1 == 1 && kind2 == 1) {
10262 int cmp = memcmp(data1, data2, len);
10263 /* normalize result of memcmp() into the range [-1; 1] */
10264 if (cmp < 0)
10265 return -1;
10266 if (cmp > 0)
10267 return 1;
10268 }
10269 else {
10270 for (i = 0; i < len; ++i) {
10271 Py_UCS4 c1, c2;
10272 c1 = PyUnicode_READ(kind1, data1, i);
10273 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010274
Victor Stinner770e19e2012-10-04 22:59:45 +020010275 if (c1 != c2)
10276 return (c1 < c2) ? -1 : 1;
10277 }
Marc-André Lemburge5034372000-08-08 08:04:29 +000010278 }
10279
Victor Stinner770e19e2012-10-04 22:59:45 +020010280 if (len1 == len2)
10281 return 0;
10282 if (len1 < len2)
10283 return -1;
10284 else
10285 return 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010286}
10287
Victor Stinnere5567ad2012-10-23 02:48:49 +020010288static int
10289unicode_compare_eq(PyObject *str1, PyObject *str2)
10290{
10291 int kind;
10292 void *data1, *data2;
10293 Py_ssize_t len;
10294 int cmp;
10295
10296 /* a string is equal to itself */
10297 if (str1 == str2)
10298 return 1;
10299
10300 len = PyUnicode_GET_LENGTH(str1);
10301 if (PyUnicode_GET_LENGTH(str2) != len)
10302 return 0;
10303 kind = PyUnicode_KIND(str1);
10304 if (PyUnicode_KIND(str2) != kind)
10305 return 0;
10306 data1 = PyUnicode_DATA(str1);
10307 data2 = PyUnicode_DATA(str2);
10308
10309 cmp = memcmp(data1, data2, len * kind);
10310 return (cmp == 0);
10311}
10312
10313
Alexander Belopolsky40018472011-02-26 01:02:56 +000010314int
10315PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10318 if (PyUnicode_READY(left) == -1 ||
10319 PyUnicode_READY(right) == -1)
10320 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010321 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010323 PyErr_Format(PyExc_TypeError,
10324 "Can't compare %.100s and %.100s",
10325 left->ob_type->tp_name,
10326 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010327 return -1;
10328}
10329
Martin v. Löwis5b222132007-06-10 09:51:05 +000010330int
10331PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10332{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 Py_ssize_t i;
10334 int kind;
10335 void *data;
10336 Py_UCS4 chr;
10337
Victor Stinner910337b2011-10-03 03:20:16 +020010338 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 if (PyUnicode_READY(uni) == -1)
10340 return -1;
10341 kind = PyUnicode_KIND(uni);
10342 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010343 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10345 if (chr != str[i])
10346 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010347 /* This check keeps Python strings that end in '\0' from comparing equal
10348 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010350 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010351 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010352 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010353 return 0;
10354}
10355
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010356
Benjamin Peterson29060642009-01-31 22:14:21 +000010357#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010358 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010359
Alexander Belopolsky40018472011-02-26 01:02:56 +000010360PyObject *
10361PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010362{
10363 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010364 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010365
Victor Stinnere5567ad2012-10-23 02:48:49 +020010366 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10367 Py_RETURN_NOTIMPLEMENTED;
10368
10369 if (PyUnicode_READY(left) == -1 ||
10370 PyUnicode_READY(right) == -1)
10371 return NULL;
10372
10373 if (op == Py_EQ || op == Py_NE) {
10374 result = unicode_compare_eq(left, right);
10375 if (op == Py_EQ)
10376 v = TEST_COND(result);
10377 else
10378 v = TEST_COND(!result);
10379 }
10380 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010381 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010382
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010383 /* Convert the return value to a Boolean */
10384 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010385 case Py_LE:
10386 v = TEST_COND(result <= 0);
10387 break;
10388 case Py_GE:
10389 v = TEST_COND(result >= 0);
10390 break;
10391 case Py_LT:
10392 v = TEST_COND(result == -1);
10393 break;
10394 case Py_GT:
10395 v = TEST_COND(result == 1);
10396 break;
10397 default:
10398 PyErr_BadArgument();
10399 return NULL;
10400 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010401 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010402 Py_INCREF(v);
10403 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010404}
10405
Alexander Belopolsky40018472011-02-26 01:02:56 +000010406int
10407PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010408{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010409 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 int kind1, kind2, kind;
10411 void *buf1, *buf2;
10412 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010413 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010414
10415 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010416 sub = PyUnicode_FromObject(element);
10417 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010418 PyErr_Format(PyExc_TypeError,
10419 "'in <string>' requires string as left operand, not %s",
10420 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010421 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010422 }
10423
Thomas Wouters477c8d52006-05-27 19:21:47 +000010424 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010425 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010426 Py_DECREF(sub);
10427 return -1;
10428 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010429 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10430 Py_DECREF(sub);
10431 Py_DECREF(str);
10432 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 kind1 = PyUnicode_KIND(str);
10435 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010436 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 buf1 = PyUnicode_DATA(str);
10438 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010439 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010440 if (kind2 > kind) {
10441 Py_DECREF(sub);
10442 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010443 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010444 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010445 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010446 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 if (!buf2) {
10448 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010449 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 return -1;
10451 }
10452 len1 = PyUnicode_GET_LENGTH(str);
10453 len2 = PyUnicode_GET_LENGTH(sub);
10454
Benjamin Petersonead6b532011-12-20 17:23:42 -060010455 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 case PyUnicode_1BYTE_KIND:
10457 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10458 break;
10459 case PyUnicode_2BYTE_KIND:
10460 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10461 break;
10462 case PyUnicode_4BYTE_KIND:
10463 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10464 break;
10465 default:
10466 result = -1;
10467 assert(0);
10468 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010469
10470 Py_DECREF(str);
10471 Py_DECREF(sub);
10472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 if (kind2 != kind)
10474 PyMem_Free(buf2);
10475
Guido van Rossum403d68b2000-03-13 15:55:09 +000010476 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010477}
10478
Guido van Rossumd57fd912000-03-10 22:53:23 +000010479/* Concat to string or Unicode object giving a new Unicode object. */
10480
Alexander Belopolsky40018472011-02-26 01:02:56 +000010481PyObject *
10482PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010485 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010486 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487
10488 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010491 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010494 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495
10496 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010497 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010498 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010501 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010502 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010504 }
10505
Victor Stinner488fa492011-12-12 00:01:39 +010010506 u_len = PyUnicode_GET_LENGTH(u);
10507 v_len = PyUnicode_GET_LENGTH(v);
10508 if (u_len > PY_SSIZE_T_MAX - v_len) {
10509 PyErr_SetString(PyExc_OverflowError,
10510 "strings are too large to concat");
10511 goto onError;
10512 }
10513 new_len = u_len + v_len;
10514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010516 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010517 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518
Guido van Rossumd57fd912000-03-10 22:53:23 +000010519 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010520 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010522 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010523 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10524 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525 Py_DECREF(u);
10526 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010527 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010529
Benjamin Peterson29060642009-01-31 22:14:21 +000010530 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531 Py_XDECREF(u);
10532 Py_XDECREF(v);
10533 return NULL;
10534}
10535
Walter Dörwald1ab83302007-05-18 17:15:44 +000010536void
Victor Stinner23e56682011-10-03 03:54:37 +020010537PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010538{
Victor Stinner23e56682011-10-03 03:54:37 +020010539 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010540 Py_UCS4 maxchar, maxchar2;
10541 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010542
10543 if (p_left == NULL) {
10544 if (!PyErr_Occurred())
10545 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010546 return;
10547 }
Victor Stinner23e56682011-10-03 03:54:37 +020010548 left = *p_left;
Serhiy Storchaka6c83e732013-01-04 12:39:34 +020010549 if (right == NULL || left == NULL || !PyUnicode_Check(left)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010550 if (!PyErr_Occurred())
10551 PyErr_BadInternalCall();
10552 goto error;
10553 }
10554
Benjamin Petersonbac79492012-01-14 13:34:47 -050010555 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010556 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010557 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010558 goto error;
10559
Victor Stinner488fa492011-12-12 00:01:39 +010010560 /* Shortcuts */
10561 if (left == unicode_empty) {
10562 Py_DECREF(left);
10563 Py_INCREF(right);
10564 *p_left = right;
10565 return;
10566 }
10567 if (right == unicode_empty)
10568 return;
10569
10570 left_len = PyUnicode_GET_LENGTH(left);
10571 right_len = PyUnicode_GET_LENGTH(right);
10572 if (left_len > PY_SSIZE_T_MAX - right_len) {
10573 PyErr_SetString(PyExc_OverflowError,
10574 "strings are too large to concat");
10575 goto error;
10576 }
10577 new_len = left_len + right_len;
10578
10579 if (unicode_modifiable(left)
10580 && PyUnicode_CheckExact(right)
10581 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010582 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10583 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010584 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010585 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010586 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10587 {
10588 /* append inplace */
10589 if (unicode_resize(p_left, new_len) != 0) {
10590 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10591 * deallocated so it cannot be put back into
10592 * 'variable'. The MemoryError is raised when there
10593 * is no value in 'variable', which might (very
10594 * remotely) be a cause of incompatibilities.
10595 */
10596 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010597 }
Victor Stinner488fa492011-12-12 00:01:39 +010010598 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010599 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010600 }
Victor Stinner488fa492011-12-12 00:01:39 +010010601 else {
10602 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10603 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010604 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010605
Victor Stinner488fa492011-12-12 00:01:39 +010010606 /* Concat the two Unicode strings */
10607 res = PyUnicode_New(new_len, maxchar);
10608 if (res == NULL)
10609 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010610 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10611 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010612 Py_DECREF(left);
10613 *p_left = res;
10614 }
10615 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010616 return;
10617
10618error:
Victor Stinner488fa492011-12-12 00:01:39 +010010619 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010620}
10621
10622void
10623PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10624{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010625 PyUnicode_Append(pleft, right);
10626 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010627}
10628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010629PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010630 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010631\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010632Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010633string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010634interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010635
10636static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010637unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010638{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010639 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010640 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010641 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010642 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 int kind1, kind2, kind;
10644 void *buf1, *buf2;
10645 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646
Jesus Ceaac451502011-04-20 17:09:23 +020010647 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10648 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010649 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 kind1 = PyUnicode_KIND(self);
10652 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010653 if (kind2 > kind1)
10654 return PyLong_FromLong(0);
10655 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 buf1 = PyUnicode_DATA(self);
10657 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010659 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 if (!buf2) {
10661 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 return NULL;
10663 }
10664 len1 = PyUnicode_GET_LENGTH(self);
10665 len2 = PyUnicode_GET_LENGTH(substring);
10666
10667 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010668 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 case PyUnicode_1BYTE_KIND:
10670 iresult = ucs1lib_count(
10671 ((Py_UCS1*)buf1) + start, end - start,
10672 buf2, len2, PY_SSIZE_T_MAX
10673 );
10674 break;
10675 case PyUnicode_2BYTE_KIND:
10676 iresult = ucs2lib_count(
10677 ((Py_UCS2*)buf1) + start, end - start,
10678 buf2, len2, PY_SSIZE_T_MAX
10679 );
10680 break;
10681 case PyUnicode_4BYTE_KIND:
10682 iresult = ucs4lib_count(
10683 ((Py_UCS4*)buf1) + start, end - start,
10684 buf2, len2, PY_SSIZE_T_MAX
10685 );
10686 break;
10687 default:
10688 assert(0); iresult = 0;
10689 }
10690
10691 result = PyLong_FromSsize_t(iresult);
10692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 if (kind2 != kind)
10694 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695
10696 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010697
Guido van Rossumd57fd912000-03-10 22:53:23 +000010698 return result;
10699}
10700
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010701PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010702 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010704Encode S using the codec registered for encoding. Default encoding\n\
10705is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010706handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010707a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10708'xmlcharrefreplace' as well as any other name registered with\n\
10709codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710
10711static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010712unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010714 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715 char *encoding = NULL;
10716 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010717
Benjamin Peterson308d6372009-09-18 21:42:35 +000010718 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10719 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010721 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010722}
10723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010724PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726\n\
10727Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010728If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729
10730static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010731unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010733 Py_ssize_t i, j, line_pos, src_len, incr;
10734 Py_UCS4 ch;
10735 PyObject *u;
10736 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010738 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010739 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740
10741 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743
Antoine Pitrou22425222011-10-04 19:10:51 +020010744 if (PyUnicode_READY(self) == -1)
10745 return NULL;
10746
Thomas Wouters7e474022000-07-16 12:04:32 +000010747 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010748 src_len = PyUnicode_GET_LENGTH(self);
10749 i = j = line_pos = 0;
10750 kind = PyUnicode_KIND(self);
10751 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010752 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010753 for (; i < src_len; i++) {
10754 ch = PyUnicode_READ(kind, src_data, i);
10755 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010756 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010757 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010758 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010759 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010760 goto overflow;
10761 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010762 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010763 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010766 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010767 goto overflow;
10768 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010770 if (ch == '\n' || ch == '\r')
10771 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010773 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010774 if (!found)
10775 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010776
Guido van Rossumd57fd912000-03-10 22:53:23 +000010777 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010778 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779 if (!u)
10780 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010781 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782
Antoine Pitroue71d5742011-10-04 15:55:09 +020010783 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784
Antoine Pitroue71d5742011-10-04 15:55:09 +020010785 for (; i < src_len; i++) {
10786 ch = PyUnicode_READ(kind, src_data, i);
10787 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010788 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010789 incr = tabsize - (line_pos % tabsize);
10790 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010791 FILL(kind, dest_data, ' ', j, incr);
10792 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010793 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010794 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010795 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010796 line_pos++;
10797 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010798 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010799 if (ch == '\n' || ch == '\r')
10800 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010801 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010802 }
10803 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010804 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010805
Antoine Pitroue71d5742011-10-04 15:55:09 +020010806 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010807 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809}
10810
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010811PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010812 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813\n\
10814Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010815such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816arguments start and end are interpreted as in slice notation.\n\
10817\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010818Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819
10820static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010821unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010823 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010824 Py_ssize_t start;
10825 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010826 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827
Jesus Ceaac451502011-04-20 17:09:23 +020010828 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10829 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 if (PyUnicode_READY(self) == -1)
10833 return NULL;
10834 if (PyUnicode_READY(substring) == -1)
10835 return NULL;
10836
Victor Stinner7931d9a2011-11-04 00:22:48 +010010837 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838
10839 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 if (result == -2)
10842 return NULL;
10843
Christian Heimes217cfd12007-12-02 14:31:20 +000010844 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010845}
10846
10847static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010848unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010850 void *data;
10851 enum PyUnicode_Kind kind;
10852 Py_UCS4 ch;
10853 PyObject *res;
10854
10855 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10856 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010858 }
10859 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10860 PyErr_SetString(PyExc_IndexError, "string index out of range");
10861 return NULL;
10862 }
10863 kind = PyUnicode_KIND(self);
10864 data = PyUnicode_DATA(self);
10865 ch = PyUnicode_READ(kind, data, index);
10866 if (ch < 256)
10867 return get_latin1_char(ch);
10868
10869 res = PyUnicode_New(1, ch);
10870 if (res == NULL)
10871 return NULL;
10872 kind = PyUnicode_KIND(res);
10873 data = PyUnicode_DATA(res);
10874 PyUnicode_WRITE(kind, data, 0, ch);
10875 assert(_PyUnicode_CheckConsistency(res, 1));
10876 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877}
10878
Guido van Rossumc2504932007-09-18 19:42:40 +000010879/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010880 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010881static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010882unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883{
Guido van Rossumc2504932007-09-18 19:42:40 +000010884 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080010885 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000010886
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010887#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050010888 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010889#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010890 if (_PyUnicode_HASH(self) != -1)
10891 return _PyUnicode_HASH(self);
10892 if (PyUnicode_READY(self) == -1)
10893 return -1;
10894 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010010895 /*
10896 We make the hash of the empty string be 0, rather than using
10897 (prefix ^ suffix), since this slightly obfuscates the hash secret
10898 */
10899 if (len == 0) {
10900 _PyUnicode_HASH(self) = 0;
10901 return 0;
10902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010903
10904 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010010905#define HASH(P) \
10906 x ^= (Py_uhash_t) *P << 7; \
10907 while (--len >= 0) \
10908 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010909
Georg Brandl2fb477c2012-02-21 00:33:36 +010010910 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010911 switch (PyUnicode_KIND(self)) {
10912 case PyUnicode_1BYTE_KIND: {
10913 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10914 HASH(c);
10915 break;
10916 }
10917 case PyUnicode_2BYTE_KIND: {
10918 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10919 HASH(s);
10920 break;
10921 }
10922 default: {
10923 Py_UCS4 *l;
10924 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10925 "Impossible switch case in unicode_hash");
10926 l = PyUnicode_4BYTE_DATA(self);
10927 HASH(l);
10928 break;
10929 }
10930 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010010931 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
10932 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933
Guido van Rossumc2504932007-09-18 19:42:40 +000010934 if (x == -1)
10935 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010937 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010938}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010939#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010941PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010942 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010944Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945
10946static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010949 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010950 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010951 Py_ssize_t start;
10952 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953
Jesus Ceaac451502011-04-20 17:09:23 +020010954 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10955 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 if (PyUnicode_READY(self) == -1)
10959 return NULL;
10960 if (PyUnicode_READY(substring) == -1)
10961 return NULL;
10962
Victor Stinner7931d9a2011-11-04 00:22:48 +010010963 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964
10965 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 if (result == -2)
10968 return NULL;
10969
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970 if (result < 0) {
10971 PyErr_SetString(PyExc_ValueError, "substring not found");
10972 return NULL;
10973 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010974
Christian Heimes217cfd12007-12-02 14:31:20 +000010975 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976}
10977
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010978PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010979 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010981Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010982at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983
10984static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010985unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987 Py_ssize_t i, length;
10988 int kind;
10989 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010990 int cased;
10991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 if (PyUnicode_READY(self) == -1)
10993 return NULL;
10994 length = PyUnicode_GET_LENGTH(self);
10995 kind = PyUnicode_KIND(self);
10996 data = PyUnicode_DATA(self);
10997
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999 if (length == 1)
11000 return PyBool_FromLong(
11001 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011003 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011005 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011006
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 for (i = 0; i < length; i++) {
11009 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011010
Benjamin Peterson29060642009-01-31 22:14:21 +000011011 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11012 return PyBool_FromLong(0);
11013 else if (!cased && Py_UNICODE_ISLOWER(ch))
11014 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011016 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017}
11018
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011019PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011020 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011022Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011023at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024
11025static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011026unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 Py_ssize_t i, length;
11029 int kind;
11030 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031 int cased;
11032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 if (PyUnicode_READY(self) == -1)
11034 return NULL;
11035 length = PyUnicode_GET_LENGTH(self);
11036 kind = PyUnicode_KIND(self);
11037 data = PyUnicode_DATA(self);
11038
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040 if (length == 1)
11041 return PyBool_FromLong(
11042 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011043
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011044 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011046 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011047
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049 for (i = 0; i < length; i++) {
11050 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011051
Benjamin Peterson29060642009-01-31 22:14:21 +000011052 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11053 return PyBool_FromLong(0);
11054 else if (!cased && Py_UNICODE_ISUPPER(ch))
11055 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011057 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058}
11059
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011060PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011061 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011063Return True if S is a titlecased string and there is at least one\n\
11064character in S, i.e. upper- and titlecase characters may only\n\
11065follow uncased characters and lowercase characters only cased ones.\n\
11066Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067
11068static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011069unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 Py_ssize_t i, length;
11072 int kind;
11073 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074 int cased, previous_is_cased;
11075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011076 if (PyUnicode_READY(self) == -1)
11077 return NULL;
11078 length = PyUnicode_GET_LENGTH(self);
11079 kind = PyUnicode_KIND(self);
11080 data = PyUnicode_DATA(self);
11081
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011083 if (length == 1) {
11084 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11085 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11086 (Py_UNICODE_ISUPPER(ch) != 0));
11087 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011089 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011090 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011091 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011092
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093 cased = 0;
11094 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 for (i = 0; i < length; i++) {
11096 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011097
Benjamin Peterson29060642009-01-31 22:14:21 +000011098 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11099 if (previous_is_cased)
11100 return PyBool_FromLong(0);
11101 previous_is_cased = 1;
11102 cased = 1;
11103 }
11104 else if (Py_UNICODE_ISLOWER(ch)) {
11105 if (!previous_is_cased)
11106 return PyBool_FromLong(0);
11107 previous_is_cased = 1;
11108 cased = 1;
11109 }
11110 else
11111 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011113 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114}
11115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011116PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011117 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011119Return True if all characters in S are whitespace\n\
11120and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121
11122static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011123unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011125 Py_ssize_t i, length;
11126 int kind;
11127 void *data;
11128
11129 if (PyUnicode_READY(self) == -1)
11130 return NULL;
11131 length = PyUnicode_GET_LENGTH(self);
11132 kind = PyUnicode_KIND(self);
11133 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011134
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136 if (length == 1)
11137 return PyBool_FromLong(
11138 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011140 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011142 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 for (i = 0; i < length; i++) {
11145 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011146 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011147 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011149 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150}
11151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011152PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011153 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011154\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011155Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011156and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011157
11158static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011159unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011160{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011161 Py_ssize_t i, length;
11162 int kind;
11163 void *data;
11164
11165 if (PyUnicode_READY(self) == -1)
11166 return NULL;
11167 length = PyUnicode_GET_LENGTH(self);
11168 kind = PyUnicode_KIND(self);
11169 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011170
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011171 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011172 if (length == 1)
11173 return PyBool_FromLong(
11174 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011175
11176 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011178 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 for (i = 0; i < length; i++) {
11181 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011182 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011183 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011184 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011185}
11186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011187PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011188 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011189\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011190Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011191and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011192
11193static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011194unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011195{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 int kind;
11197 void *data;
11198 Py_ssize_t len, i;
11199
11200 if (PyUnicode_READY(self) == -1)
11201 return NULL;
11202
11203 kind = PyUnicode_KIND(self);
11204 data = PyUnicode_DATA(self);
11205 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011206
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011207 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 if (len == 1) {
11209 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11210 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11211 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011212
11213 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011215 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011217 for (i = 0; i < len; i++) {
11218 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011219 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011220 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011221 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011222 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011223}
11224
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011225PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011226 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011228Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011229False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230
11231static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011232unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 Py_ssize_t i, length;
11235 int kind;
11236 void *data;
11237
11238 if (PyUnicode_READY(self) == -1)
11239 return NULL;
11240 length = PyUnicode_GET_LENGTH(self);
11241 kind = PyUnicode_KIND(self);
11242 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245 if (length == 1)
11246 return PyBool_FromLong(
11247 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011249 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011251 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 for (i = 0; i < length; i++) {
11254 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011255 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011257 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258}
11259
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011260PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011261 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011263Return True if all characters in S are digits\n\
11264and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265
11266static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011267unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011269 Py_ssize_t i, length;
11270 int kind;
11271 void *data;
11272
11273 if (PyUnicode_READY(self) == -1)
11274 return NULL;
11275 length = PyUnicode_GET_LENGTH(self);
11276 kind = PyUnicode_KIND(self);
11277 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011280 if (length == 1) {
11281 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11282 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11283 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011285 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011286 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011287 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011289 for (i = 0; i < length; i++) {
11290 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011291 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011293 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294}
11295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011296PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011297 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011299Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011300False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301
11302static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011303unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011305 Py_ssize_t i, length;
11306 int kind;
11307 void *data;
11308
11309 if (PyUnicode_READY(self) == -1)
11310 return NULL;
11311 length = PyUnicode_GET_LENGTH(self);
11312 kind = PyUnicode_KIND(self);
11313 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011316 if (length == 1)
11317 return PyBool_FromLong(
11318 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011320 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011322 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011324 for (i = 0; i < length; i++) {
11325 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011326 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011328 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329}
11330
Martin v. Löwis47383402007-08-15 07:32:56 +000011331int
11332PyUnicode_IsIdentifier(PyObject *self)
11333{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 int kind;
11335 void *data;
11336 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011337 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 if (PyUnicode_READY(self) == -1) {
11340 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011341 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 }
11343
11344 /* Special case for empty strings */
11345 if (PyUnicode_GET_LENGTH(self) == 0)
11346 return 0;
11347 kind = PyUnicode_KIND(self);
11348 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011349
11350 /* PEP 3131 says that the first character must be in
11351 XID_Start and subsequent characters in XID_Continue,
11352 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011353 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011354 letters, digits, underscore). However, given the current
11355 definition of XID_Start and XID_Continue, it is sufficient
11356 to check just for these, except that _ must be allowed
11357 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011359 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011360 return 0;
11361
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011362 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011363 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011364 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011365 return 1;
11366}
11367
11368PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011369 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011370\n\
11371Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011372to the language definition.\n\
11373\n\
11374Use keyword.iskeyword() to test for reserved identifiers\n\
11375such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011376
11377static PyObject*
11378unicode_isidentifier(PyObject *self)
11379{
11380 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11381}
11382
Georg Brandl559e5d72008-06-11 18:37:52 +000011383PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011384 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011385\n\
11386Return True if all characters in S are considered\n\
11387printable in repr() or S is empty, False otherwise.");
11388
11389static PyObject*
11390unicode_isprintable(PyObject *self)
11391{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 Py_ssize_t i, length;
11393 int kind;
11394 void *data;
11395
11396 if (PyUnicode_READY(self) == -1)
11397 return NULL;
11398 length = PyUnicode_GET_LENGTH(self);
11399 kind = PyUnicode_KIND(self);
11400 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011401
11402 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403 if (length == 1)
11404 return PyBool_FromLong(
11405 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407 for (i = 0; i < length; i++) {
11408 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011409 Py_RETURN_FALSE;
11410 }
11411 }
11412 Py_RETURN_TRUE;
11413}
11414
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011415PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011416 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417\n\
11418Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011419iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420
11421static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011422unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011424 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425}
11426
Martin v. Löwis18e16552006-02-15 17:27:45 +000011427static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011428unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430 if (PyUnicode_READY(self) == -1)
11431 return -1;
11432 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433}
11434
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011435PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011436 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011438Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011439done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440
11441static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011442unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011444 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 Py_UCS4 fillchar = ' ';
11446
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011447 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448 return NULL;
11449
Benjamin Petersonbac79492012-01-14 13:34:47 -050011450 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011451 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452
Victor Stinnerc4b49542011-12-11 22:44:26 +010011453 if (PyUnicode_GET_LENGTH(self) >= width)
11454 return unicode_result_unchanged(self);
11455
11456 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457}
11458
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011459PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011460 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011462Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463
11464static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011465unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011467 if (PyUnicode_READY(self) == -1)
11468 return NULL;
11469 if (PyUnicode_IS_ASCII(self))
11470 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011471 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472}
11473
/* Strip modes; they double as indexes into stripformat[] below */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string past its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
11482
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011483/* externally visible for str.strip(unicode) */
11484PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011485_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011487 void *data;
11488 int kind;
11489 Py_ssize_t i, j, len;
11490 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11493 return NULL;
11494
11495 kind = PyUnicode_KIND(self);
11496 data = PyUnicode_DATA(self);
11497 len = PyUnicode_GET_LENGTH(self);
11498 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11499 PyUnicode_DATA(sepobj),
11500 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011501
Benjamin Peterson14339b62009-01-31 16:36:08 +000011502 i = 0;
11503 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011504 while (i < len &&
11505 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011506 i++;
11507 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011508 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011509
Benjamin Peterson14339b62009-01-31 16:36:08 +000011510 j = len;
11511 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011512 do {
11513 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011514 } while (j >= i &&
11515 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011516 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011517 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011518
Victor Stinner7931d9a2011-11-04 00:22:48 +010011519 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520}
11521
11522PyObject*
11523PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11524{
11525 unsigned char *data;
11526 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011527 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528
Victor Stinnerde636f32011-10-01 03:55:54 +020011529 if (PyUnicode_READY(self) == -1)
11530 return NULL;
11531
Victor Stinner684d5fd2012-05-03 02:32:34 +020011532 length = PyUnicode_GET_LENGTH(self);
11533 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011534
Victor Stinner684d5fd2012-05-03 02:32:34 +020011535 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011536 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011537
Victor Stinnerde636f32011-10-01 03:55:54 +020011538 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011539 PyErr_SetString(PyExc_IndexError, "string index out of range");
11540 return NULL;
11541 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011542 if (start >= length || end < start)
11543 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011544
Victor Stinner684d5fd2012-05-03 02:32:34 +020011545 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011546 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011547 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011548 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011549 }
11550 else {
11551 kind = PyUnicode_KIND(self);
11552 data = PyUnicode_1BYTE_DATA(self);
11553 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011554 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011555 length);
11556 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558
11559static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011560do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011562 int kind;
11563 void *data;
11564 Py_ssize_t len, i, j;
11565
11566 if (PyUnicode_READY(self) == -1)
11567 return NULL;
11568
11569 kind = PyUnicode_KIND(self);
11570 data = PyUnicode_DATA(self);
11571 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011572
Benjamin Peterson14339b62009-01-31 16:36:08 +000011573 i = 0;
11574 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011576 i++;
11577 }
11578 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011579
Benjamin Peterson14339b62009-01-31 16:36:08 +000011580 j = len;
11581 if (striptype != LEFTSTRIP) {
11582 do {
11583 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011585 j++;
11586 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011587
Victor Stinner7931d9a2011-11-04 00:22:48 +010011588 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589}
11590
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011591
11592static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011593do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011594{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011595 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011596
Benjamin Peterson14339b62009-01-31 16:36:08 +000011597 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11598 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011599
Benjamin Peterson14339b62009-01-31 16:36:08 +000011600 if (sep != NULL && sep != Py_None) {
11601 if (PyUnicode_Check(sep))
11602 return _PyUnicode_XStrip(self, striptype, sep);
11603 else {
11604 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011605 "%s arg must be None or str",
11606 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011607 return NULL;
11608 }
11609 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011610
Benjamin Peterson14339b62009-01-31 16:36:08 +000011611 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011612}
11613
11614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011615PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011616 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011617\n\
11618Return a copy of the string S with leading and trailing\n\
11619whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011620If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011621
11622static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011623unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011624{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011625 if (PyTuple_GET_SIZE(args) == 0)
11626 return do_strip(self, BOTHSTRIP); /* Common case */
11627 else
11628 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011629}
11630
11631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011632PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011634\n\
11635Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011636If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011637
11638static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011639unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011640{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011641 if (PyTuple_GET_SIZE(args) == 0)
11642 return do_strip(self, LEFTSTRIP); /* Common case */
11643 else
11644 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011645}
11646
11647
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011648PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011649 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011650\n\
11651Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011652If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011653
11654static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011655unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011656{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011657 if (PyTuple_GET_SIZE(args) == 0)
11658 return do_strip(self, RIGHTSTRIP); /* Common case */
11659 else
11660 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011661}
11662
11663
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011665unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011667 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011668 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669
Serhiy Storchaka05997252013-01-26 12:14:02 +020011670 if (len < 1)
11671 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672
Victor Stinnerc4b49542011-12-11 22:44:26 +010011673 /* no repeat, return original string */
11674 if (len == 1)
11675 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011676
Benjamin Petersonbac79492012-01-14 13:34:47 -050011677 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 return NULL;
11679
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011680 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011681 PyErr_SetString(PyExc_OverflowError,
11682 "repeated string is too long");
11683 return NULL;
11684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011686
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011687 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688 if (!u)
11689 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011690 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 if (PyUnicode_GET_LENGTH(str) == 1) {
11693 const int kind = PyUnicode_KIND(str);
11694 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011695 if (kind == PyUnicode_1BYTE_KIND) {
11696 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011697 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011698 }
11699 else if (kind == PyUnicode_2BYTE_KIND) {
11700 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011701 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011702 ucs2[n] = fill_char;
11703 } else {
11704 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11705 assert(kind == PyUnicode_4BYTE_KIND);
11706 for (n = 0; n < len; ++n)
11707 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011708 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 }
11710 else {
11711 /* number of characters copied this far */
11712 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011713 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 char *to = (char *) PyUnicode_DATA(u);
11715 Py_MEMCPY(to, PyUnicode_DATA(str),
11716 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011717 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011718 n = (done <= nchars-done) ? done : nchars-done;
11719 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011720 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011721 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722 }
11723
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011724 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011725 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726}
11727
Alexander Belopolsky40018472011-02-26 01:02:56 +000011728PyObject *
11729PyUnicode_Replace(PyObject *obj,
11730 PyObject *subobj,
11731 PyObject *replobj,
11732 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733{
11734 PyObject *self;
11735 PyObject *str1;
11736 PyObject *str2;
11737 PyObject *result;
11738
11739 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011740 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011741 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011743 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011744 Py_DECREF(self);
11745 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746 }
11747 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011748 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011749 Py_DECREF(self);
11750 Py_DECREF(str1);
11751 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011753 if (PyUnicode_READY(self) == -1 ||
11754 PyUnicode_READY(str1) == -1 ||
11755 PyUnicode_READY(str2) == -1)
11756 result = NULL;
11757 else
11758 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759 Py_DECREF(self);
11760 Py_DECREF(str1);
11761 Py_DECREF(str2);
11762 return result;
11763}
11764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011765PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011766 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767\n\
11768Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011769old replaced by new. If the optional argument count is\n\
11770given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771
11772static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 PyObject *str1;
11776 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011777 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778 PyObject *result;
11779
Martin v. Löwis18e16552006-02-15 17:27:45 +000011780 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011782 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011783 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011784 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011785 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 return NULL;
11787 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011788 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011789 Py_DECREF(str1);
11790 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011791 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011792 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11793 result = NULL;
11794 else
11795 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796
11797 Py_DECREF(str1);
11798 Py_DECREF(str2);
11799 return result;
11800}
11801
Alexander Belopolsky40018472011-02-26 01:02:56 +000011802static PyObject *
11803unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011805 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806 Py_ssize_t isize;
11807 Py_ssize_t osize, squote, dquote, i, o;
11808 Py_UCS4 max, quote;
11809 int ikind, okind;
11810 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011812 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011813 return NULL;
11814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 isize = PyUnicode_GET_LENGTH(unicode);
11816 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 /* Compute length of output, quote characters, and
11819 maximum character */
11820 osize = 2; /* quotes */
11821 max = 127;
11822 squote = dquote = 0;
11823 ikind = PyUnicode_KIND(unicode);
11824 for (i = 0; i < isize; i++) {
11825 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11826 switch (ch) {
11827 case '\'': squote++; osize++; break;
11828 case '"': dquote++; osize++; break;
11829 case '\\': case '\t': case '\r': case '\n':
11830 osize += 2; break;
11831 default:
11832 /* Fast-path ASCII */
11833 if (ch < ' ' || ch == 0x7f)
11834 osize += 4; /* \xHH */
11835 else if (ch < 0x7f)
11836 osize++;
11837 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11838 osize++;
11839 max = ch > max ? ch : max;
11840 }
11841 else if (ch < 0x100)
11842 osize += 4; /* \xHH */
11843 else if (ch < 0x10000)
11844 osize += 6; /* \uHHHH */
11845 else
11846 osize += 10; /* \uHHHHHHHH */
11847 }
11848 }
11849
11850 quote = '\'';
11851 if (squote) {
11852 if (dquote)
11853 /* Both squote and dquote present. Use squote,
11854 and escape them */
11855 osize += squote;
11856 else
11857 quote = '"';
11858 }
11859
11860 repr = PyUnicode_New(osize, max);
11861 if (repr == NULL)
11862 return NULL;
11863 okind = PyUnicode_KIND(repr);
11864 odata = PyUnicode_DATA(repr);
11865
11866 PyUnicode_WRITE(okind, odata, 0, quote);
11867 PyUnicode_WRITE(okind, odata, osize-1, quote);
11868
11869 for (i = 0, o = 1; i < isize; i++) {
11870 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011871
11872 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 if ((ch == quote) || (ch == '\\')) {
11874 PyUnicode_WRITE(okind, odata, o++, '\\');
11875 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011876 continue;
11877 }
11878
Benjamin Peterson29060642009-01-31 22:14:21 +000011879 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011880 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 PyUnicode_WRITE(okind, odata, o++, '\\');
11882 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011883 }
11884 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011885 PyUnicode_WRITE(okind, odata, o++, '\\');
11886 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011887 }
11888 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011889 PyUnicode_WRITE(okind, odata, o++, '\\');
11890 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011891 }
11892
11893 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011894 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 PyUnicode_WRITE(okind, odata, o++, '\\');
11896 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011897 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11898 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011899 }
11900
Georg Brandl559e5d72008-06-11 18:37:52 +000011901 /* Copy ASCII characters as-is */
11902 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011903 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011904 }
11905
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011907 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011908 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011909 (categories Z* and C* except ASCII space)
11910 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011912 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000011913 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011915 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011916 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11917 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011918 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011919 /* Map 16-bit characters to '\uxxxx' */
11920 else if (ch <= 0xffff) {
11921 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011922 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11923 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11924 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11925 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011926 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011927 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011928 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011929 PyUnicode_WRITE(okind, odata, o++, 'U');
11930 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11931 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11932 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11933 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020011934 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11935 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11936 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11937 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011938 }
11939 }
11940 /* Copy characters as-is */
11941 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011943 }
11944 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011947 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011948 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949}
11950
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011951PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011952 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953\n\
11954Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011955such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956arguments start and end are interpreted as in slice notation.\n\
11957\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011958Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959
11960static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011963 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011964 Py_ssize_t start;
11965 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011966 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967
Jesus Ceaac451502011-04-20 17:09:23 +020011968 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11969 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011970 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 if (PyUnicode_READY(self) == -1)
11973 return NULL;
11974 if (PyUnicode_READY(substring) == -1)
11975 return NULL;
11976
Victor Stinner7931d9a2011-11-04 00:22:48 +010011977 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978
11979 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 if (result == -2)
11982 return NULL;
11983
Christian Heimes217cfd12007-12-02 14:31:20 +000011984 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985}
11986
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011987PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011988 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011990Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991
11992static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011995 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011996 Py_ssize_t start;
11997 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011998 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999
Jesus Ceaac451502011-04-20 17:09:23 +020012000 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12001 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012002 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 if (PyUnicode_READY(self) == -1)
12005 return NULL;
12006 if (PyUnicode_READY(substring) == -1)
12007 return NULL;
12008
Victor Stinner7931d9a2011-11-04 00:22:48 +010012009 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010
12011 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 if (result == -2)
12014 return NULL;
12015
Guido van Rossumd57fd912000-03-10 22:53:23 +000012016 if (result < 0) {
12017 PyErr_SetString(PyExc_ValueError, "substring not found");
12018 return NULL;
12019 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020
Christian Heimes217cfd12007-12-02 14:31:20 +000012021 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022}
12023
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012024PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012025 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012027Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012028done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029
12030static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012031unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012033 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034 Py_UCS4 fillchar = ' ';
12035
Victor Stinnere9a29352011-10-01 02:14:59 +020012036 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012038
Benjamin Petersonbac79492012-01-14 13:34:47 -050012039 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040 return NULL;
12041
Victor Stinnerc4b49542011-12-11 22:44:26 +010012042 if (PyUnicode_GET_LENGTH(self) >= width)
12043 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012044
Victor Stinnerc4b49542011-12-11 22:44:26 +010012045 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046}
12047
Alexander Belopolsky40018472011-02-26 01:02:56 +000012048PyObject *
12049PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050{
12051 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012052
Guido van Rossumd57fd912000-03-10 22:53:23 +000012053 s = PyUnicode_FromObject(s);
12054 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012055 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012056 if (sep != NULL) {
12057 sep = PyUnicode_FromObject(sep);
12058 if (sep == NULL) {
12059 Py_DECREF(s);
12060 return NULL;
12061 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062 }
12063
Victor Stinner9310abb2011-10-05 00:59:23 +020012064 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065
12066 Py_DECREF(s);
12067 Py_XDECREF(sep);
12068 return result;
12069}
12070
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012071PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012072 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073\n\
12074Return a list of the words in S, using sep as the\n\
12075delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012076splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012077whitespace string is a separator and empty strings are\n\
12078removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079
12080static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012081unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012083 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012085 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012087 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12088 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089 return NULL;
12090
12091 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012092 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012094 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012096 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097}
12098
Thomas Wouters477c8d52006-05-27 19:21:47 +000012099PyObject *
12100PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12101{
12102 PyObject* str_obj;
12103 PyObject* sep_obj;
12104 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 int kind1, kind2, kind;
12106 void *buf1 = NULL, *buf2 = NULL;
12107 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012108
12109 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012110 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012111 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012112 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012113 if (!sep_obj) {
12114 Py_DECREF(str_obj);
12115 return NULL;
12116 }
12117 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12118 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012119 Py_DECREF(str_obj);
12120 return NULL;
12121 }
12122
Victor Stinner14f8f022011-10-05 20:58:25 +020012123 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012125 kind = Py_MAX(kind1, kind2);
12126 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012128 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 if (!buf1)
12130 goto onError;
12131 buf2 = PyUnicode_DATA(sep_obj);
12132 if (kind2 != kind)
12133 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12134 if (!buf2)
12135 goto onError;
12136 len1 = PyUnicode_GET_LENGTH(str_obj);
12137 len2 = PyUnicode_GET_LENGTH(sep_obj);
12138
Benjamin Petersonead6b532011-12-20 17:23:42 -060012139 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012141 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12142 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12143 else
12144 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 break;
12146 case PyUnicode_2BYTE_KIND:
12147 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12148 break;
12149 case PyUnicode_4BYTE_KIND:
12150 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12151 break;
12152 default:
12153 assert(0);
12154 out = 0;
12155 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012156
12157 Py_DECREF(sep_obj);
12158 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 if (kind1 != kind)
12160 PyMem_Free(buf1);
12161 if (kind2 != kind)
12162 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012163
12164 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 onError:
12166 Py_DECREF(sep_obj);
12167 Py_DECREF(str_obj);
12168 if (kind1 != kind && buf1)
12169 PyMem_Free(buf1);
12170 if (kind2 != kind && buf2)
12171 PyMem_Free(buf2);
12172 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012173}
12174
12175
12176PyObject *
12177PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12178{
12179 PyObject* str_obj;
12180 PyObject* sep_obj;
12181 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182 int kind1, kind2, kind;
12183 void *buf1 = NULL, *buf2 = NULL;
12184 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012185
12186 str_obj = PyUnicode_FromObject(str_in);
12187 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012188 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012189 sep_obj = PyUnicode_FromObject(sep_in);
12190 if (!sep_obj) {
12191 Py_DECREF(str_obj);
12192 return NULL;
12193 }
12194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195 kind1 = PyUnicode_KIND(str_in);
12196 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012197 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012198 buf1 = PyUnicode_DATA(str_in);
12199 if (kind1 != kind)
12200 buf1 = _PyUnicode_AsKind(str_in, kind);
12201 if (!buf1)
12202 goto onError;
12203 buf2 = PyUnicode_DATA(sep_obj);
12204 if (kind2 != kind)
12205 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12206 if (!buf2)
12207 goto onError;
12208 len1 = PyUnicode_GET_LENGTH(str_obj);
12209 len2 = PyUnicode_GET_LENGTH(sep_obj);
12210
Benjamin Petersonead6b532011-12-20 17:23:42 -060012211 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012213 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12214 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12215 else
12216 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012217 break;
12218 case PyUnicode_2BYTE_KIND:
12219 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12220 break;
12221 case PyUnicode_4BYTE_KIND:
12222 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12223 break;
12224 default:
12225 assert(0);
12226 out = 0;
12227 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012228
12229 Py_DECREF(sep_obj);
12230 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 if (kind1 != kind)
12232 PyMem_Free(buf1);
12233 if (kind2 != kind)
12234 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012235
12236 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 onError:
12238 Py_DECREF(sep_obj);
12239 Py_DECREF(str_obj);
12240 if (kind1 != kind && buf1)
12241 PyMem_Free(buf1);
12242 if (kind2 != kind && buf2)
12243 PyMem_Free(buf2);
12244 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012245}
12246
12247PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012248 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012249\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012250Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012251the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012252found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012253
12254static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012255unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012256{
Victor Stinner9310abb2011-10-05 00:59:23 +020012257 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012258}
12259
12260PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012261 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012262\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012263Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012264the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012265separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012266
12267static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012268unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012269{
Victor Stinner9310abb2011-10-05 00:59:23 +020012270 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012271}
12272
Alexander Belopolsky40018472011-02-26 01:02:56 +000012273PyObject *
12274PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012275{
12276 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012277
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012278 s = PyUnicode_FromObject(s);
12279 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012280 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012281 if (sep != NULL) {
12282 sep = PyUnicode_FromObject(sep);
12283 if (sep == NULL) {
12284 Py_DECREF(s);
12285 return NULL;
12286 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012287 }
12288
Victor Stinner9310abb2011-10-05 00:59:23 +020012289 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012290
12291 Py_DECREF(s);
12292 Py_XDECREF(sep);
12293 return result;
12294}
12295
12296PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012297 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012298\n\
12299Return a list of the words in S, using sep as the\n\
12300delimiter string, starting at the end of the string and\n\
12301working to the front. If maxsplit is given, at most maxsplit\n\
12302splits are done. If sep is not specified, any whitespace string\n\
12303is a separator.");
12304
12305static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012306unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012307{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012308 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012309 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012310 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012311
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012312 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12313 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012314 return NULL;
12315
12316 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012317 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012318 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012319 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012320 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012321 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012322}
12323
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012324PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012325 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012326\n\
12327Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012328Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012329is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012330
12331static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012332unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012333{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012334 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012335 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012336
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012337 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12338 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012339 return NULL;
12340
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012341 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012342}
12343
12344static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012345PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012346{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012347 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012348}
12349
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012350PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012351 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012352\n\
12353Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012354and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355
12356static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012357unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012359 if (PyUnicode_READY(self) == -1)
12360 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012361 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012362}
12363
Georg Brandlceee0772007-11-27 23:48:05 +000012364PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012365 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012366\n\
12367Return a translation table usable for str.translate().\n\
12368If there is only one argument, it must be a dictionary mapping Unicode\n\
12369ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012370Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012371If there are two arguments, they must be strings of equal length, and\n\
12372in the resulting dictionary, each character in x will be mapped to the\n\
12373character at the same position in y. If there is a third argument, it\n\
12374must be a string, whose characters will be mapped to None in the result.");
12375
12376static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012377unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012378{
12379 PyObject *x, *y = NULL, *z = NULL;
12380 PyObject *new = NULL, *key, *value;
12381 Py_ssize_t i = 0;
12382 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012383
Georg Brandlceee0772007-11-27 23:48:05 +000012384 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12385 return NULL;
12386 new = PyDict_New();
12387 if (!new)
12388 return NULL;
12389 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390 int x_kind, y_kind, z_kind;
12391 void *x_data, *y_data, *z_data;
12392
Georg Brandlceee0772007-11-27 23:48:05 +000012393 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012394 if (!PyUnicode_Check(x)) {
12395 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12396 "be a string if there is a second argument");
12397 goto err;
12398 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012400 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12401 "arguments must have equal length");
12402 goto err;
12403 }
12404 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012405 x_kind = PyUnicode_KIND(x);
12406 y_kind = PyUnicode_KIND(y);
12407 x_data = PyUnicode_DATA(x);
12408 y_data = PyUnicode_DATA(y);
12409 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12410 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012411 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012412 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012413 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012414 if (!value) {
12415 Py_DECREF(key);
12416 goto err;
12417 }
Georg Brandlceee0772007-11-27 23:48:05 +000012418 res = PyDict_SetItem(new, key, value);
12419 Py_DECREF(key);
12420 Py_DECREF(value);
12421 if (res < 0)
12422 goto err;
12423 }
12424 /* create entries for deleting chars in z */
12425 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426 z_kind = PyUnicode_KIND(z);
12427 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012428 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012430 if (!key)
12431 goto err;
12432 res = PyDict_SetItem(new, key, Py_None);
12433 Py_DECREF(key);
12434 if (res < 0)
12435 goto err;
12436 }
12437 }
12438 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012439 int kind;
12440 void *data;
12441
Georg Brandlceee0772007-11-27 23:48:05 +000012442 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012443 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012444 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12445 "to maketrans it must be a dict");
12446 goto err;
12447 }
12448 /* copy entries into the new dict, converting string keys to int keys */
12449 while (PyDict_Next(x, &i, &key, &value)) {
12450 if (PyUnicode_Check(key)) {
12451 /* convert string keys to integer keys */
12452 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012453 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012454 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12455 "table must be of length 1");
12456 goto err;
12457 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 kind = PyUnicode_KIND(key);
12459 data = PyUnicode_DATA(key);
12460 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012461 if (!newkey)
12462 goto err;
12463 res = PyDict_SetItem(new, newkey, value);
12464 Py_DECREF(newkey);
12465 if (res < 0)
12466 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012467 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012468 /* just keep integer keys */
12469 if (PyDict_SetItem(new, key, value) < 0)
12470 goto err;
12471 } else {
12472 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12473 "be strings or integers");
12474 goto err;
12475 }
12476 }
12477 }
12478 return new;
12479 err:
12480 Py_DECREF(new);
12481 return NULL;
12482}
12483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012484PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012485 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486\n\
12487Return a copy of the string S, where all characters have been mapped\n\
12488through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012489Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012490Unmapped characters are left untouched. Characters mapped to None\n\
12491are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012492
12493static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012495{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497}
12498
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012499PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012500 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012502Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503
12504static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012505unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012507 if (PyUnicode_READY(self) == -1)
12508 return NULL;
12509 if (PyUnicode_IS_ASCII(self))
12510 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012511 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012512}
12513
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012514PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012515 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012517Pad a numeric string S with zeros on the left, to fill a field\n\
12518of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519
12520static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012521unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012523 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012524 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012525 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526 int kind;
12527 void *data;
12528 Py_UCS4 chr;
12529
Martin v. Löwis18e16552006-02-15 17:27:45 +000012530 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531 return NULL;
12532
Benjamin Petersonbac79492012-01-14 13:34:47 -050012533 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012534 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535
Victor Stinnerc4b49542011-12-11 22:44:26 +010012536 if (PyUnicode_GET_LENGTH(self) >= width)
12537 return unicode_result_unchanged(self);
12538
12539 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540
12541 u = pad(self, fill, 0, '0');
12542
Walter Dörwald068325e2002-04-15 13:36:47 +000012543 if (u == NULL)
12544 return NULL;
12545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546 kind = PyUnicode_KIND(u);
12547 data = PyUnicode_DATA(u);
12548 chr = PyUnicode_READ(kind, data, fill);
12549
12550 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012552 PyUnicode_WRITE(kind, data, 0, chr);
12553 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554 }
12555
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012556 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012557 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559
#if 0
/* Compiled out.  Thin wrapper returning the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12567
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012568PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012569 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012571Return True if S starts with the specified prefix, False otherwise.\n\
12572With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012573With optional end, stop comparing S at that position.\n\
12574prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575
12576static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012577unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012578 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012580 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012581 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012582 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012583 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012584 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585
Jesus Ceaac451502011-04-20 17:09:23 +020012586 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012587 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012588 if (PyTuple_Check(subobj)) {
12589 Py_ssize_t i;
12590 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012591 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012592 if (substring == NULL)
12593 return NULL;
12594 result = tailmatch(self, substring, start, end, -1);
12595 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012596 if (result == -1)
12597 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012598 if (result) {
12599 Py_RETURN_TRUE;
12600 }
12601 }
12602 /* nothing matched */
12603 Py_RETURN_FALSE;
12604 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012605 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012606 if (substring == NULL) {
12607 if (PyErr_ExceptionMatches(PyExc_TypeError))
12608 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12609 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012610 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012611 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012612 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012614 if (result == -1)
12615 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012616 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617}
12618
12619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012620PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012621 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012623Return True if S ends with the specified suffix, False otherwise.\n\
12624With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012625With optional end, stop comparing S at that position.\n\
12626suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627
12628static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012629unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012630 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012632 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012633 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012634 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012635 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012636 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637
Jesus Ceaac451502011-04-20 17:09:23 +020012638 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012639 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012640 if (PyTuple_Check(subobj)) {
12641 Py_ssize_t i;
12642 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012643 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012644 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012645 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012646 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012647 result = tailmatch(self, substring, start, end, +1);
12648 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010012649 if (result == -1)
12650 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012651 if (result) {
12652 Py_RETURN_TRUE;
12653 }
12654 }
12655 Py_RETURN_FALSE;
12656 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012657 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012658 if (substring == NULL) {
12659 if (PyErr_ExceptionMatches(PyExc_TypeError))
12660 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12661 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012662 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012663 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012664 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010012665 if (result == -1)
12666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012668 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012669}
12670
Victor Stinner202fdca2012-05-07 12:47:02 +020012671Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012672_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012673{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012674 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012675 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12676 writer->data = PyUnicode_DATA(writer->buffer);
12677 writer->kind = PyUnicode_KIND(writer->buffer);
12678}
12679
Victor Stinnerd3f08822012-05-29 12:57:52 +020012680void
12681_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012682{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012683 memset(writer, 0, sizeof(*writer));
12684#ifdef Py_DEBUG
12685 writer->kind = 5; /* invalid kind */
12686#endif
12687 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012688 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012689}
12690
Victor Stinnerd3f08822012-05-29 12:57:52 +020012691int
12692_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12693 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012694{
12695 Py_ssize_t newlen;
12696 PyObject *newbuffer;
12697
Victor Stinnerd3f08822012-05-29 12:57:52 +020012698 assert(length > 0);
12699
Victor Stinner202fdca2012-05-07 12:47:02 +020012700 if (length > PY_SSIZE_T_MAX - writer->pos) {
12701 PyErr_NoMemory();
12702 return -1;
12703 }
12704 newlen = writer->pos + length;
12705
Victor Stinnerd3f08822012-05-29 12:57:52 +020012706 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012707 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012708 /* overallocate 25% to limit the number of resize */
12709 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12710 newlen += newlen / 4;
12711 if (newlen < writer->min_length)
12712 newlen = writer->min_length;
12713 }
12714 writer->buffer = PyUnicode_New(newlen, maxchar);
12715 if (writer->buffer == NULL)
12716 return -1;
12717 _PyUnicodeWriter_Update(writer);
12718 return 0;
12719 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012720
Victor Stinnerd3f08822012-05-29 12:57:52 +020012721 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012722 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012723 /* overallocate 25% to limit the number of resize */
12724 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12725 newlen += newlen / 4;
12726 if (newlen < writer->min_length)
12727 newlen = writer->min_length;
12728 }
12729
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012730 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012731 /* resize + widen */
12732 newbuffer = PyUnicode_New(newlen, maxchar);
12733 if (newbuffer == NULL)
12734 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012735 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12736 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012737 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012738 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012739 }
12740 else {
12741 newbuffer = resize_compact(writer->buffer, newlen);
12742 if (newbuffer == NULL)
12743 return -1;
12744 }
12745 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012746 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012747 }
12748 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012749 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012750 newbuffer = PyUnicode_New(writer->size, maxchar);
12751 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012752 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012753 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12754 writer->buffer, 0, writer->pos);
12755 Py_DECREF(writer->buffer);
12756 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012757 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012758 }
12759 return 0;
12760}
12761
Victor Stinnerd3f08822012-05-29 12:57:52 +020012762int
12763_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12764{
12765 Py_UCS4 maxchar;
12766 Py_ssize_t len;
12767
12768 if (PyUnicode_READY(str) == -1)
12769 return -1;
12770 len = PyUnicode_GET_LENGTH(str);
12771 if (len == 0)
12772 return 0;
12773 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12774 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012775 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012776 Py_INCREF(str);
12777 writer->buffer = str;
12778 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012779 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012780 writer->size = 0;
12781 writer->pos += len;
12782 return 0;
12783 }
12784 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12785 return -1;
12786 }
12787 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12788 str, 0, len);
12789 writer->pos += len;
12790 return 0;
12791}
12792
Victor Stinnere215d962012-10-06 23:03:36 +020012793int
Victor Stinnercfc4c132013-04-03 01:48:39 +020012794_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
12795 Py_ssize_t start, Py_ssize_t end)
12796{
12797 Py_UCS4 maxchar;
12798 Py_ssize_t len;
12799
12800 if (PyUnicode_READY(str) == -1)
12801 return -1;
12802
12803 assert(0 <= start);
12804 assert(end <= PyUnicode_GET_LENGTH(str));
12805 assert(start <= end);
12806
12807 if (end == 0)
12808 return 0;
12809
12810 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
12811 return _PyUnicodeWriter_WriteStr(writer, str);
12812
12813 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
12814 maxchar = _PyUnicode_FindMaxChar(str, start, end);
12815 else
12816 maxchar = writer->maxchar;
12817 len = end - start;
12818
12819 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
12820 return -1;
12821
12822 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12823 str, start, len);
12824 writer->pos += len;
12825 return 0;
12826}
12827
12828int
Victor Stinnere215d962012-10-06 23:03:36 +020012829_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
12830{
12831 Py_UCS4 maxchar;
12832
12833 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
12834 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
12835 return -1;
12836 unicode_write_cstr(writer->buffer, writer->pos, str, len);
12837 writer->pos += len;
12838 return 0;
12839}
12840
Victor Stinnerd3f08822012-05-29 12:57:52 +020012841PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012842_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012843{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012844 if (writer->pos == 0) {
12845 Py_XDECREF(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020012846 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020012847 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012848 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012849 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12850 return writer->buffer;
12851 }
12852 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12853 PyObject *newbuffer;
12854 newbuffer = resize_compact(writer->buffer, writer->pos);
12855 if (newbuffer == NULL) {
12856 Py_DECREF(writer->buffer);
12857 return NULL;
12858 }
12859 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012860 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012861 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner2cb16aa2013-03-06 19:28:37 +010012862 return unicode_result_ready(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012863}
12864
Victor Stinnerd3f08822012-05-29 12:57:52 +020012865void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012866_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012867{
12868 Py_CLEAR(writer->buffer);
12869}
12870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012871#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012872
12873PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012874 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012875\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012876Return a formatted version of S, using substitutions from args and kwargs.\n\
12877The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012878
Eric Smith27bbca62010-11-04 17:06:58 +000012879PyDoc_STRVAR(format_map__doc__,
12880 "S.format_map(mapping) -> str\n\
12881\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012882Return a formatted version of S, using substitutions from mapping.\n\
12883The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012884
Eric Smith4a7d76d2008-05-30 18:10:19 +000012885static PyObject *
12886unicode__format__(PyObject* self, PyObject* args)
12887{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012888 PyObject *format_spec;
12889 _PyUnicodeWriter writer;
12890 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012891
12892 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12893 return NULL;
12894
Victor Stinnerd3f08822012-05-29 12:57:52 +020012895 if (PyUnicode_READY(self) == -1)
12896 return NULL;
12897 _PyUnicodeWriter_Init(&writer, 0);
12898 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12899 self, format_spec, 0,
12900 PyUnicode_GET_LENGTH(format_spec));
12901 if (ret == -1) {
12902 _PyUnicodeWriter_Dealloc(&writer);
12903 return NULL;
12904 }
12905 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012906}
12907
Eric Smith8c663262007-08-25 02:26:07 +000012908PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012909 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012910\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012911Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012912
12913static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012914unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012915{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012916 Py_ssize_t size;
12917
12918 /* If it's a compact object, account for base structure +
12919 character data. */
12920 if (PyUnicode_IS_COMPACT_ASCII(v))
12921 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12922 else if (PyUnicode_IS_COMPACT(v))
12923 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012924 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012925 else {
12926 /* If it is a two-block object, account for base object, and
12927 for character block if present. */
12928 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012929 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012931 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012932 }
12933 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012934 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012935 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012937 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012938 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939
12940 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012941}
12942
12943PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012944 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012945
12946static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012947unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012948{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010012949 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012950 if (!copy)
12951 return NULL;
12952 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012953}
12954
Guido van Rossumd57fd912000-03-10 22:53:23 +000012955static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012956 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012957 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012958 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
12959 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012960 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12961 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050012962 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012963 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12964 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12965 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12966 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12967 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012968 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012969 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12970 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12971 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012972 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012973 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12974 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12975 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012976 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012977 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012978 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012979 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012980 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12981 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12982 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12983 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12984 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12985 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12986 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12987 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12988 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12989 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12990 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12991 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12992 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12993 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012994 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012995 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012996 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012997 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012998 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012999 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013000 {"maketrans", (PyCFunction) unicode_maketrans,
13001 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013002 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013003#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013004 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013005 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013006#endif
13007
Benjamin Peterson14339b62009-01-31 16:36:08 +000013008 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013009 {NULL, NULL}
13010};
13011
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013012static PyObject *
13013unicode_mod(PyObject *v, PyObject *w)
13014{
Brian Curtindfc80e32011-08-10 20:28:54 -050013015 if (!PyUnicode_Check(v))
13016 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013017 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013018}
13019
13020static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013021 0, /*nb_add*/
13022 0, /*nb_subtract*/
13023 0, /*nb_multiply*/
13024 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013025};
13026
Guido van Rossumd57fd912000-03-10 22:53:23 +000013027static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013028 (lenfunc) unicode_length, /* sq_length */
13029 PyUnicode_Concat, /* sq_concat */
13030 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13031 (ssizeargfunc) unicode_getitem, /* sq_item */
13032 0, /* sq_slice */
13033 0, /* sq_ass_item */
13034 0, /* sq_ass_slice */
13035 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013036};
13037
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013038static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013039unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013040{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013041 if (PyUnicode_READY(self) == -1)
13042 return NULL;
13043
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013044 if (PyIndex_Check(item)) {
13045 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013046 if (i == -1 && PyErr_Occurred())
13047 return NULL;
13048 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013049 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013050 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013051 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013052 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013053 PyObject *result;
13054 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013055 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013056 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013058 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013059 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013060 return NULL;
13061 }
13062
13063 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013064 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013065 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013066 slicelength == PyUnicode_GET_LENGTH(self)) {
13067 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013068 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013069 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013070 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013071 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013072 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013073 src_kind = PyUnicode_KIND(self);
13074 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013075 if (!PyUnicode_IS_ASCII(self)) {
13076 kind_limit = kind_maxchar_limit(src_kind);
13077 max_char = 0;
13078 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13079 ch = PyUnicode_READ(src_kind, src_data, cur);
13080 if (ch > max_char) {
13081 max_char = ch;
13082 if (max_char >= kind_limit)
13083 break;
13084 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013085 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013086 }
Victor Stinner55c99112011-10-13 01:17:06 +020013087 else
13088 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013089 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013090 if (result == NULL)
13091 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013092 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013093 dest_data = PyUnicode_DATA(result);
13094
13095 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013096 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13097 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013098 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013099 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013100 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013101 } else {
13102 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13103 return NULL;
13104 }
13105}
13106
13107static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013108 (lenfunc)unicode_length, /* mp_length */
13109 (binaryfunc)unicode_subscript, /* mp_subscript */
13110 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013111};
13112
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114/* Helpers for PyUnicode_Format() */
13115
Victor Stinnera47082312012-10-04 02:19:54 +020013116struct unicode_formatter_t {
13117 PyObject *args;
13118 int args_owned;
13119 Py_ssize_t arglen, argidx;
13120 PyObject *dict;
13121
13122 enum PyUnicode_Kind fmtkind;
13123 Py_ssize_t fmtcnt, fmtpos;
13124 void *fmtdata;
13125 PyObject *fmtstr;
13126
13127 _PyUnicodeWriter writer;
13128};
13129
13130struct unicode_format_arg_t {
13131 Py_UCS4 ch;
13132 int flags;
13133 Py_ssize_t width;
13134 int prec;
13135 int sign;
13136};
13137
Guido van Rossumd57fd912000-03-10 22:53:23 +000013138static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013139unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013140{
Victor Stinnera47082312012-10-04 02:19:54 +020013141 Py_ssize_t argidx = ctx->argidx;
13142
13143 if (argidx < ctx->arglen) {
13144 ctx->argidx++;
13145 if (ctx->arglen < 0)
13146 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013147 else
Victor Stinnera47082312012-10-04 02:19:54 +020013148 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013149 }
13150 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013151 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013152 return NULL;
13153}
13154
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013155/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013156
Victor Stinnera47082312012-10-04 02:19:54 +020013157/* Format a float into the writer if the writer is not NULL, or into *p_output
13158 otherwise.
13159
13160 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013161static int
Victor Stinnera47082312012-10-04 02:19:54 +020013162formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13163 PyObject **p_output,
13164 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013165{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013166 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013167 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013168 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013169 int prec;
13170 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013171
Guido van Rossumd57fd912000-03-10 22:53:23 +000013172 x = PyFloat_AsDouble(v);
13173 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013174 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013175
Victor Stinnera47082312012-10-04 02:19:54 +020013176 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013177 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013178 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013179
Victor Stinnera47082312012-10-04 02:19:54 +020013180 if (arg->flags & F_ALT)
13181 dtoa_flags = Py_DTSF_ALT;
13182 else
13183 dtoa_flags = 0;
13184 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013185 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013186 return -1;
13187 len = strlen(p);
13188 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013189 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13190 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013191 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013192 }
Victor Stinner184252a2012-06-16 02:57:41 +020013193 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013194 writer->pos += len;
13195 }
13196 else
13197 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013198 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013199 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013200}
13201
Victor Stinnerd0880d52012-04-27 23:40:13 +020013202/* formatlong() emulates the format codes d, u, o, x and X, and
13203 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13204 * Python's regular ints.
13205 * Return value: a new PyUnicodeObject*, or NULL if error.
13206 * The output string is of the form
13207 * "-"? ("0x" | "0X")? digit+
13208 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13209 * set in flags. The case of hex digits will be correct,
13210 * There will be at least prec digits, zero-filled on the left if
13211 * necessary to get that many.
13212 * val object to be converted
13213 * flags bitmask of format flags; only F_ALT is looked at
13214 * prec minimum number of digits; 0-fill on left if needed
13215 * type a character in [duoxX]; u acts the same as d
13216 *
13217 * CAUTION: o, x and X conversions on regular ints can never
13218 * produce a '-' sign, but can for Python's unbounded ints.
13219 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013220static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013221formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013222{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013223 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013224 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013225 Py_ssize_t i;
13226 int sign; /* 1 if '-', else 0 */
13227 int len; /* number of characters */
13228 Py_ssize_t llen;
13229 int numdigits; /* len == numnondigits + numdigits */
13230 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013231 int prec = arg->prec;
13232 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013233
Victor Stinnerd0880d52012-04-27 23:40:13 +020013234 /* Avoid exceeding SSIZE_T_MAX */
13235 if (prec > INT_MAX-3) {
13236 PyErr_SetString(PyExc_OverflowError,
13237 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013238 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013239 }
13240
13241 assert(PyLong_Check(val));
13242
13243 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013244 default:
13245 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013246 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013247 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013248 case 'u':
13249 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013250 if (PyBool_Check(val))
13251 result = PyNumber_ToBase(val, 10);
13252 else
13253 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013254 break;
13255 case 'o':
13256 numnondigits = 2;
13257 result = PyNumber_ToBase(val, 8);
13258 break;
13259 case 'x':
13260 case 'X':
13261 numnondigits = 2;
13262 result = PyNumber_ToBase(val, 16);
13263 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013264 }
13265 if (!result)
13266 return NULL;
13267
13268 assert(unicode_modifiable(result));
13269 assert(PyUnicode_IS_READY(result));
13270 assert(PyUnicode_IS_ASCII(result));
13271
13272 /* To modify the string in-place, there can only be one reference. */
13273 if (Py_REFCNT(result) != 1) {
13274 PyErr_BadInternalCall();
13275 return NULL;
13276 }
13277 buf = PyUnicode_DATA(result);
13278 llen = PyUnicode_GET_LENGTH(result);
13279 if (llen > INT_MAX) {
13280 PyErr_SetString(PyExc_ValueError,
13281 "string too large in _PyBytes_FormatLong");
13282 return NULL;
13283 }
13284 len = (int)llen;
13285 sign = buf[0] == '-';
13286 numnondigits += sign;
13287 numdigits = len - numnondigits;
13288 assert(numdigits > 0);
13289
13290 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013291 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013292 (type == 'o' || type == 'x' || type == 'X'))) {
13293 assert(buf[sign] == '0');
13294 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13295 buf[sign+1] == 'o');
13296 numnondigits -= 2;
13297 buf += 2;
13298 len -= 2;
13299 if (sign)
13300 buf[0] = '-';
13301 assert(len == numnondigits + numdigits);
13302 assert(numdigits > 0);
13303 }
13304
13305 /* Fill with leading zeroes to meet minimum width. */
13306 if (prec > numdigits) {
13307 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13308 numnondigits + prec);
13309 char *b1;
13310 if (!r1) {
13311 Py_DECREF(result);
13312 return NULL;
13313 }
13314 b1 = PyBytes_AS_STRING(r1);
13315 for (i = 0; i < numnondigits; ++i)
13316 *b1++ = *buf++;
13317 for (i = 0; i < prec - numdigits; i++)
13318 *b1++ = '0';
13319 for (i = 0; i < numdigits; i++)
13320 *b1++ = *buf++;
13321 *b1 = '\0';
13322 Py_DECREF(result);
13323 result = r1;
13324 buf = PyBytes_AS_STRING(result);
13325 len = numnondigits + prec;
13326 }
13327
13328 /* Fix up case for hex conversions. */
13329 if (type == 'X') {
13330 /* Need to convert all lower case letters to upper case.
13331 and need to convert 0x to 0X (and -0x to -0X). */
13332 for (i = 0; i < len; i++)
13333 if (buf[i] >= 'a' && buf[i] <= 'x')
13334 buf[i] -= 'a'-'A';
13335 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013336 if (!PyUnicode_Check(result)
13337 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013338 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013339 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013340 Py_DECREF(result);
13341 result = unicode;
13342 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013343 else if (len != PyUnicode_GET_LENGTH(result)) {
13344 if (PyUnicode_Resize(&result, len) < 0)
13345 Py_CLEAR(result);
13346 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013347 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013348}
13349
Victor Stinner621ef3d2012-10-02 00:33:47 +020013350/* Format an integer.
13351 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013352 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013353 * -1 and raise an exception on error */
13354static int
Victor Stinnera47082312012-10-04 02:19:54 +020013355mainformatlong(PyObject *v,
13356 struct unicode_format_arg_t *arg,
13357 PyObject **p_output,
13358 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013359{
13360 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013361 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013362
13363 if (!PyNumber_Check(v))
13364 goto wrongtype;
13365
13366 if (!PyLong_Check(v)) {
13367 iobj = PyNumber_Long(v);
13368 if (iobj == NULL) {
13369 if (PyErr_ExceptionMatches(PyExc_TypeError))
13370 goto wrongtype;
13371 return -1;
13372 }
13373 assert(PyLong_Check(iobj));
13374 }
13375 else {
13376 iobj = v;
13377 Py_INCREF(iobj);
13378 }
13379
13380 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013381 && arg->width == -1 && arg->prec == -1
13382 && !(arg->flags & (F_SIGN | F_BLANK))
13383 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013384 {
13385 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013386 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013387 int base;
13388
Victor Stinnera47082312012-10-04 02:19:54 +020013389 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013390 {
13391 default:
13392 assert(0 && "'type' not in [diuoxX]");
13393 case 'd':
13394 case 'i':
13395 case 'u':
13396 base = 10;
13397 break;
13398 case 'o':
13399 base = 8;
13400 break;
13401 case 'x':
13402 case 'X':
13403 base = 16;
13404 break;
13405 }
13406
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013407 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13408 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013409 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013410 }
13411 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013412 return 1;
13413 }
13414
Victor Stinnera47082312012-10-04 02:19:54 +020013415 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013416 Py_DECREF(iobj);
13417 if (res == NULL)
13418 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013419 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013420 return 0;
13421
13422wrongtype:
13423 PyErr_Format(PyExc_TypeError,
13424 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013425 "not %.200s",
13426 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013427 return -1;
13428}
13429
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013430static Py_UCS4
13431formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013432{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013433 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013434 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013435 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013436 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013437 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013438 goto onError;
13439 }
13440 else {
13441 /* Integer input truncated to a character */
13442 long x;
13443 x = PyLong_AsLong(v);
13444 if (x == -1 && PyErr_Occurred())
13445 goto onError;
13446
Victor Stinner8faf8212011-12-08 22:14:11 +010013447 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013448 PyErr_SetString(PyExc_OverflowError,
13449 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013450 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013451 }
13452
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013453 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013454 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013455
Benjamin Peterson29060642009-01-31 22:14:21 +000013456 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013457 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013458 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013459 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013460}
13461
Victor Stinnera47082312012-10-04 02:19:54 +020013462/* Parse options of an argument: flags, width, precision.
13463 Handle also "%(name)" syntax.
13464
13465 Return 0 if the argument has been formatted into arg->str.
13466 Return 1 if the argument has been written into ctx->writer,
13467 Raise an exception and return -1 on error. */
13468static int
13469unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13470 struct unicode_format_arg_t *arg)
13471{
13472#define FORMAT_READ(ctx) \
13473 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13474
13475 PyObject *v;
13476
Victor Stinnera47082312012-10-04 02:19:54 +020013477 if (arg->ch == '(') {
13478 /* Get argument value from a dictionary. Example: "%(name)s". */
13479 Py_ssize_t keystart;
13480 Py_ssize_t keylen;
13481 PyObject *key;
13482 int pcount = 1;
13483
13484 if (ctx->dict == NULL) {
13485 PyErr_SetString(PyExc_TypeError,
13486 "format requires a mapping");
13487 return -1;
13488 }
13489 ++ctx->fmtpos;
13490 --ctx->fmtcnt;
13491 keystart = ctx->fmtpos;
13492 /* Skip over balanced parentheses */
13493 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13494 arg->ch = FORMAT_READ(ctx);
13495 if (arg->ch == ')')
13496 --pcount;
13497 else if (arg->ch == '(')
13498 ++pcount;
13499 ctx->fmtpos++;
13500 }
13501 keylen = ctx->fmtpos - keystart - 1;
13502 if (ctx->fmtcnt < 0 || pcount > 0) {
13503 PyErr_SetString(PyExc_ValueError,
13504 "incomplete format key");
13505 return -1;
13506 }
13507 key = PyUnicode_Substring(ctx->fmtstr,
13508 keystart, keystart + keylen);
13509 if (key == NULL)
13510 return -1;
13511 if (ctx->args_owned) {
13512 Py_DECREF(ctx->args);
13513 ctx->args_owned = 0;
13514 }
13515 ctx->args = PyObject_GetItem(ctx->dict, key);
13516 Py_DECREF(key);
13517 if (ctx->args == NULL)
13518 return -1;
13519 ctx->args_owned = 1;
13520 ctx->arglen = -1;
13521 ctx->argidx = -2;
13522 }
13523
13524 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020013525 while (--ctx->fmtcnt >= 0) {
13526 arg->ch = FORMAT_READ(ctx);
13527 ctx->fmtpos++;
13528 switch (arg->ch) {
13529 case '-': arg->flags |= F_LJUST; continue;
13530 case '+': arg->flags |= F_SIGN; continue;
13531 case ' ': arg->flags |= F_BLANK; continue;
13532 case '#': arg->flags |= F_ALT; continue;
13533 case '0': arg->flags |= F_ZERO; continue;
13534 }
13535 break;
13536 }
13537
13538 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020013539 if (arg->ch == '*') {
13540 v = unicode_format_getnextarg(ctx);
13541 if (v == NULL)
13542 return -1;
13543 if (!PyLong_Check(v)) {
13544 PyErr_SetString(PyExc_TypeError,
13545 "* wants int");
13546 return -1;
13547 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013548 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013549 if (arg->width == -1 && PyErr_Occurred())
13550 return -1;
13551 if (arg->width < 0) {
13552 arg->flags |= F_LJUST;
13553 arg->width = -arg->width;
13554 }
13555 if (--ctx->fmtcnt >= 0) {
13556 arg->ch = FORMAT_READ(ctx);
13557 ctx->fmtpos++;
13558 }
13559 }
13560 else if (arg->ch >= '0' && arg->ch <= '9') {
13561 arg->width = arg->ch - '0';
13562 while (--ctx->fmtcnt >= 0) {
13563 arg->ch = FORMAT_READ(ctx);
13564 ctx->fmtpos++;
13565 if (arg->ch < '0' || arg->ch > '9')
13566 break;
13567 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13568 mixing signed and unsigned comparison. Since arg->ch is between
13569 '0' and '9', casting to int is safe. */
13570 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13571 PyErr_SetString(PyExc_ValueError,
13572 "width too big");
13573 return -1;
13574 }
13575 arg->width = arg->width*10 + (arg->ch - '0');
13576 }
13577 }
13578
13579 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020013580 if (arg->ch == '.') {
13581 arg->prec = 0;
13582 if (--ctx->fmtcnt >= 0) {
13583 arg->ch = FORMAT_READ(ctx);
13584 ctx->fmtpos++;
13585 }
13586 if (arg->ch == '*') {
13587 v = unicode_format_getnextarg(ctx);
13588 if (v == NULL)
13589 return -1;
13590 if (!PyLong_Check(v)) {
13591 PyErr_SetString(PyExc_TypeError,
13592 "* wants int");
13593 return -1;
13594 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020013595 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020013596 if (arg->prec == -1 && PyErr_Occurred())
13597 return -1;
13598 if (arg->prec < 0)
13599 arg->prec = 0;
13600 if (--ctx->fmtcnt >= 0) {
13601 arg->ch = FORMAT_READ(ctx);
13602 ctx->fmtpos++;
13603 }
13604 }
13605 else if (arg->ch >= '0' && arg->ch <= '9') {
13606 arg->prec = arg->ch - '0';
13607 while (--ctx->fmtcnt >= 0) {
13608 arg->ch = FORMAT_READ(ctx);
13609 ctx->fmtpos++;
13610 if (arg->ch < '0' || arg->ch > '9')
13611 break;
13612 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13613 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020013614 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020013615 return -1;
13616 }
13617 arg->prec = arg->prec*10 + (arg->ch - '0');
13618 }
13619 }
13620 }
13621
13622 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13623 if (ctx->fmtcnt >= 0) {
13624 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13625 if (--ctx->fmtcnt >= 0) {
13626 arg->ch = FORMAT_READ(ctx);
13627 ctx->fmtpos++;
13628 }
13629 }
13630 }
13631 if (ctx->fmtcnt < 0) {
13632 PyErr_SetString(PyExc_ValueError,
13633 "incomplete format");
13634 return -1;
13635 }
13636 return 0;
13637
13638#undef FORMAT_READ
13639}
13640
13641/* Format one argument. Supported conversion specifiers:
13642
13643 - "s", "r", "a": any type
13644 - "i", "d", "u", "o", "x", "X": int
13645 - "e", "E", "f", "F", "g", "G": float
13646 - "c": int or str (1 character)
13647
Victor Stinner8dbd4212012-12-04 09:30:24 +010013648 When possible, the output is written directly into the Unicode writer
13649 (ctx->writer). A string is created when padding is required.
13650
Victor Stinnera47082312012-10-04 02:19:54 +020013651 Return 0 if the argument has been formatted into *p_str,
13652 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010013653 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020013654static int
13655unicode_format_arg_format(struct unicode_formatter_t *ctx,
13656 struct unicode_format_arg_t *arg,
13657 PyObject **p_str)
13658{
13659 PyObject *v;
13660 _PyUnicodeWriter *writer = &ctx->writer;
13661
13662 if (ctx->fmtcnt == 0)
13663 ctx->writer.overallocate = 0;
13664
13665 if (arg->ch == '%') {
13666 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1)
13667 return -1;
13668 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
13669 writer->pos += 1;
13670 return 1;
13671 }
13672
13673 v = unicode_format_getnextarg(ctx);
13674 if (v == NULL)
13675 return -1;
13676
Victor Stinnera47082312012-10-04 02:19:54 +020013677
13678 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020013679 case 's':
13680 case 'r':
13681 case 'a':
13682 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13683 /* Fast path */
13684 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13685 return -1;
13686 return 1;
13687 }
13688
13689 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13690 *p_str = v;
13691 Py_INCREF(*p_str);
13692 }
13693 else {
13694 if (arg->ch == 's')
13695 *p_str = PyObject_Str(v);
13696 else if (arg->ch == 'r')
13697 *p_str = PyObject_Repr(v);
13698 else
13699 *p_str = PyObject_ASCII(v);
13700 }
13701 break;
13702
13703 case 'i':
13704 case 'd':
13705 case 'u':
13706 case 'o':
13707 case 'x':
13708 case 'X':
13709 {
13710 int ret = mainformatlong(v, arg, p_str, writer);
13711 if (ret != 0)
13712 return ret;
13713 arg->sign = 1;
13714 break;
13715 }
13716
13717 case 'e':
13718 case 'E':
13719 case 'f':
13720 case 'F':
13721 case 'g':
13722 case 'G':
13723 if (arg->width == -1 && arg->prec == -1
13724 && !(arg->flags & (F_SIGN | F_BLANK)))
13725 {
13726 /* Fast path */
13727 if (formatfloat(v, arg, NULL, writer) == -1)
13728 return -1;
13729 return 1;
13730 }
13731
13732 arg->sign = 1;
13733 if (formatfloat(v, arg, p_str, NULL) == -1)
13734 return -1;
13735 break;
13736
13737 case 'c':
13738 {
13739 Py_UCS4 ch = formatchar(v);
13740 if (ch == (Py_UCS4) -1)
13741 return -1;
13742 if (arg->width == -1 && arg->prec == -1) {
13743 /* Fast path */
13744 if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1)
13745 return -1;
13746 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13747 writer->pos += 1;
13748 return 1;
13749 }
13750 *p_str = PyUnicode_FromOrdinal(ch);
13751 break;
13752 }
13753
13754 default:
13755 PyErr_Format(PyExc_ValueError,
13756 "unsupported format character '%c' (0x%x) "
13757 "at index %zd",
13758 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13759 (int)arg->ch,
13760 ctx->fmtpos - 1);
13761 return -1;
13762 }
13763 if (*p_str == NULL)
13764 return -1;
13765 assert (PyUnicode_Check(*p_str));
13766 return 0;
13767}
13768
13769static int
13770unicode_format_arg_output(struct unicode_formatter_t *ctx,
13771 struct unicode_format_arg_t *arg,
13772 PyObject *str)
13773{
13774 Py_ssize_t len;
13775 enum PyUnicode_Kind kind;
13776 void *pbuf;
13777 Py_ssize_t pindex;
13778 Py_UCS4 signchar;
13779 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013780 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020013781 Py_ssize_t sublen;
13782 _PyUnicodeWriter *writer = &ctx->writer;
13783 Py_UCS4 fill;
13784
13785 fill = ' ';
13786 if (arg->sign && arg->flags & F_ZERO)
13787 fill = '0';
13788
13789 if (PyUnicode_READY(str) == -1)
13790 return -1;
13791
13792 len = PyUnicode_GET_LENGTH(str);
13793 if ((arg->width == -1 || arg->width <= len)
13794 && (arg->prec == -1 || arg->prec >= len)
13795 && !(arg->flags & (F_SIGN | F_BLANK)))
13796 {
13797 /* Fast path */
13798 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
13799 return -1;
13800 return 0;
13801 }
13802
13803 /* Truncate the string for "s", "r" and "a" formats
13804 if the precision is set */
13805 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
13806 if (arg->prec >= 0 && len > arg->prec)
13807 len = arg->prec;
13808 }
13809
13810 /* Adjust sign and width */
13811 kind = PyUnicode_KIND(str);
13812 pbuf = PyUnicode_DATA(str);
13813 pindex = 0;
13814 signchar = '\0';
13815 if (arg->sign) {
13816 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13817 if (ch == '-' || ch == '+') {
13818 signchar = ch;
13819 len--;
13820 pindex++;
13821 }
13822 else if (arg->flags & F_SIGN)
13823 signchar = '+';
13824 else if (arg->flags & F_BLANK)
13825 signchar = ' ';
13826 else
13827 arg->sign = 0;
13828 }
13829 if (arg->width < len)
13830 arg->width = len;
13831
13832 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013833 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020013834 if (!(arg->flags & F_LJUST)) {
13835 if (arg->sign) {
13836 if ((arg->width-1) > len)
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013837 maxchar = MAX_MAXCHAR(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020013838 }
13839 else {
13840 if (arg->width > len)
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013841 maxchar = MAX_MAXCHAR(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020013842 }
13843 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013844 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
13845 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
13846 maxchar = MAX_MAXCHAR(maxchar, strmaxchar);
13847 }
13848
Victor Stinnera47082312012-10-04 02:19:54 +020013849 buflen = arg->width;
13850 if (arg->sign && len == arg->width)
13851 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020013852 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020013853 return -1;
13854
13855 /* Write the sign if needed */
13856 if (arg->sign) {
13857 if (fill != ' ') {
13858 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13859 writer->pos += 1;
13860 }
13861 if (arg->width > len)
13862 arg->width--;
13863 }
13864
13865 /* Write the numeric prefix for "x", "X" and "o" formats
13866 if the alternate form is used.
13867 For example, write "0x" for the "%#x" format. */
13868 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13869 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13870 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
13871 if (fill != ' ') {
13872 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13873 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13874 writer->pos += 2;
13875 pindex += 2;
13876 }
13877 arg->width -= 2;
13878 if (arg->width < 0)
13879 arg->width = 0;
13880 len -= 2;
13881 }
13882
13883 /* Pad left with the fill character if needed */
13884 if (arg->width > len && !(arg->flags & F_LJUST)) {
13885 sublen = arg->width - len;
13886 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
13887 writer->pos += sublen;
13888 arg->width = len;
13889 }
13890
13891 /* If padding with spaces: write sign if needed and/or numeric prefix if
13892 the alternate form is used */
13893 if (fill == ' ') {
13894 if (arg->sign) {
13895 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13896 writer->pos += 1;
13897 }
13898 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13899 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13900 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
13901 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13902 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13903 writer->pos += 2;
13904 pindex += 2;
13905 }
13906 }
13907
13908 /* Write characters */
13909 if (len) {
13910 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13911 str, pindex, len);
13912 writer->pos += len;
13913 }
13914
13915 /* Pad right with the fill character if needed */
13916 if (arg->width > len) {
13917 sublen = arg->width - len;
13918 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
13919 writer->pos += sublen;
13920 }
13921 return 0;
13922}
13923
13924/* Helper of PyUnicode_Format(): format one arg.
13925 Return 0 on success, raise an exception and return -1 on error. */
13926static int
13927unicode_format_arg(struct unicode_formatter_t *ctx)
13928{
13929 struct unicode_format_arg_t arg;
13930 PyObject *str;
13931 int ret;
13932
Victor Stinner8dbd4212012-12-04 09:30:24 +010013933 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
13934 arg.flags = 0;
13935 arg.width = -1;
13936 arg.prec = -1;
13937 arg.sign = 0;
13938 str = NULL;
13939
Victor Stinnera47082312012-10-04 02:19:54 +020013940 ret = unicode_format_arg_parse(ctx, &arg);
13941 if (ret == -1)
13942 return -1;
13943
13944 ret = unicode_format_arg_format(ctx, &arg, &str);
13945 if (ret == -1)
13946 return -1;
13947
13948 if (ret != 1) {
13949 ret = unicode_format_arg_output(ctx, &arg, str);
13950 Py_DECREF(str);
13951 if (ret == -1)
13952 return -1;
13953 }
13954
13955 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
13956 PyErr_SetString(PyExc_TypeError,
13957 "not all arguments converted during string formatting");
13958 return -1;
13959 }
13960 return 0;
13961}
13962
Alexander Belopolsky40018472011-02-26 01:02:56 +000013963PyObject *
13964PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013965{
Victor Stinnera47082312012-10-04 02:19:54 +020013966 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000013967
Guido van Rossumd57fd912000-03-10 22:53:23 +000013968 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013969 PyErr_BadInternalCall();
13970 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013971 }
Victor Stinnera47082312012-10-04 02:19:54 +020013972
13973 ctx.fmtstr = PyUnicode_FromObject(format);
13974 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013975 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020013976 if (PyUnicode_READY(ctx.fmtstr) == -1) {
13977 Py_DECREF(ctx.fmtstr);
13978 return NULL;
13979 }
13980 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
13981 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
13982 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
13983 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013984
Victor Stinnera47082312012-10-04 02:19:54 +020013985 _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013986
Guido van Rossumd57fd912000-03-10 22:53:23 +000013987 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020013988 ctx.arglen = PyTuple_Size(args);
13989 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013990 }
13991 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013992 ctx.arglen = -1;
13993 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013994 }
Victor Stinnera47082312012-10-04 02:19:54 +020013995 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013996 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020013997 ctx.dict = args;
13998 else
13999 ctx.dict = NULL;
14000 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014001
Victor Stinnera47082312012-10-04 02:19:54 +020014002 while (--ctx.fmtcnt >= 0) {
14003 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014004 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014005
14006 nonfmtpos = ctx.fmtpos++;
14007 while (ctx.fmtcnt >= 0 &&
14008 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14009 ctx.fmtpos++;
14010 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014011 }
Victor Stinnera47082312012-10-04 02:19:54 +020014012 if (ctx.fmtcnt < 0) {
14013 ctx.fmtpos--;
14014 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014015 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014016
Victor Stinnercfc4c132013-04-03 01:48:39 +020014017 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14018 nonfmtpos, ctx.fmtpos) < 0)
14019 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014020 }
14021 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014022 ctx.fmtpos++;
14023 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014024 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014025 }
14026 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014027
Victor Stinnera47082312012-10-04 02:19:54 +020014028 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014029 PyErr_SetString(PyExc_TypeError,
14030 "not all arguments converted during string formatting");
14031 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014032 }
14033
Victor Stinnera47082312012-10-04 02:19:54 +020014034 if (ctx.args_owned) {
14035 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014036 }
Victor Stinnera47082312012-10-04 02:19:54 +020014037 Py_DECREF(ctx.fmtstr);
14038 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014039
Benjamin Peterson29060642009-01-31 22:14:21 +000014040 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014041 Py_DECREF(ctx.fmtstr);
14042 _PyUnicodeWriter_Dealloc(&ctx.writer);
14043 if (ctx.args_owned) {
14044 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014045 }
14046 return NULL;
14047}
14048
Jeremy Hylton938ace62002-07-17 16:30:39 +000014049static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014050unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14051
Tim Peters6d6c1a32001-08-02 04:15:00 +000014052static PyObject *
14053unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14054{
Benjamin Peterson29060642009-01-31 22:14:21 +000014055 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014056 static char *kwlist[] = {"object", "encoding", "errors", 0};
14057 char *encoding = NULL;
14058 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014059
Benjamin Peterson14339b62009-01-31 16:36:08 +000014060 if (type != &PyUnicode_Type)
14061 return unicode_subtype_new(type, args, kwds);
14062 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014063 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014064 return NULL;
14065 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014066 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014067 if (encoding == NULL && errors == NULL)
14068 return PyObject_Str(x);
14069 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014070 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014071}
14072
Guido van Rossume023fe02001-08-30 03:12:59 +000014073static PyObject *
14074unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14075{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014076 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014077 Py_ssize_t length, char_size;
14078 int share_wstr, share_utf8;
14079 unsigned int kind;
14080 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014081
Benjamin Peterson14339b62009-01-31 16:36:08 +000014082 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014083
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014084 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014085 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014086 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014087 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014088 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014089 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014090 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014091 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014092
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014093 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014094 if (self == NULL) {
14095 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014096 return NULL;
14097 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014098 kind = PyUnicode_KIND(unicode);
14099 length = PyUnicode_GET_LENGTH(unicode);
14100
14101 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014102#ifdef Py_DEBUG
14103 _PyUnicode_HASH(self) = -1;
14104#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014105 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014106#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014107 _PyUnicode_STATE(self).interned = 0;
14108 _PyUnicode_STATE(self).kind = kind;
14109 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014110 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014111 _PyUnicode_STATE(self).ready = 1;
14112 _PyUnicode_WSTR(self) = NULL;
14113 _PyUnicode_UTF8_LENGTH(self) = 0;
14114 _PyUnicode_UTF8(self) = NULL;
14115 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014116 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014117
14118 share_utf8 = 0;
14119 share_wstr = 0;
14120 if (kind == PyUnicode_1BYTE_KIND) {
14121 char_size = 1;
14122 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14123 share_utf8 = 1;
14124 }
14125 else if (kind == PyUnicode_2BYTE_KIND) {
14126 char_size = 2;
14127 if (sizeof(wchar_t) == 2)
14128 share_wstr = 1;
14129 }
14130 else {
14131 assert(kind == PyUnicode_4BYTE_KIND);
14132 char_size = 4;
14133 if (sizeof(wchar_t) == 4)
14134 share_wstr = 1;
14135 }
14136
14137 /* Ensure we won't overflow the length. */
14138 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14139 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014140 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014141 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014142 data = PyObject_MALLOC((length + 1) * char_size);
14143 if (data == NULL) {
14144 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014145 goto onError;
14146 }
14147
Victor Stinnerc3c74152011-10-02 20:39:55 +020014148 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014149 if (share_utf8) {
14150 _PyUnicode_UTF8_LENGTH(self) = length;
14151 _PyUnicode_UTF8(self) = data;
14152 }
14153 if (share_wstr) {
14154 _PyUnicode_WSTR_LENGTH(self) = length;
14155 _PyUnicode_WSTR(self) = (wchar_t *)data;
14156 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014157
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014158 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014159 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014160 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014161#ifdef Py_DEBUG
14162 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14163#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014164 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014165 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014166
14167onError:
14168 Py_DECREF(unicode);
14169 Py_DECREF(self);
14170 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014171}
14172
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014173PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014174"str(object='') -> str\n\
14175str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014176\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014177Create a new string object from the given object. If encoding or\n\
14178errors is specified, then the object must expose a data buffer\n\
14179that will be decoded using the given encoding and error handler.\n\
14180Otherwise, returns the result of object.__str__() (if defined)\n\
14181or repr(object).\n\
14182encoding defaults to sys.getdefaultencoding().\n\
14183errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014184
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014185static PyObject *unicode_iter(PyObject *seq);
14186
Guido van Rossumd57fd912000-03-10 22:53:23 +000014187PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014188 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014189 "str", /* tp_name */
14190 sizeof(PyUnicodeObject), /* tp_size */
14191 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014192 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014193 (destructor)unicode_dealloc, /* tp_dealloc */
14194 0, /* tp_print */
14195 0, /* tp_getattr */
14196 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014197 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014198 unicode_repr, /* tp_repr */
14199 &unicode_as_number, /* tp_as_number */
14200 &unicode_as_sequence, /* tp_as_sequence */
14201 &unicode_as_mapping, /* tp_as_mapping */
14202 (hashfunc) unicode_hash, /* tp_hash*/
14203 0, /* tp_call*/
14204 (reprfunc) unicode_str, /* tp_str */
14205 PyObject_GenericGetAttr, /* tp_getattro */
14206 0, /* tp_setattro */
14207 0, /* tp_as_buffer */
14208 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014209 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014210 unicode_doc, /* tp_doc */
14211 0, /* tp_traverse */
14212 0, /* tp_clear */
14213 PyUnicode_RichCompare, /* tp_richcompare */
14214 0, /* tp_weaklistoffset */
14215 unicode_iter, /* tp_iter */
14216 0, /* tp_iternext */
14217 unicode_methods, /* tp_methods */
14218 0, /* tp_members */
14219 0, /* tp_getset */
14220 &PyBaseObject_Type, /* tp_base */
14221 0, /* tp_dict */
14222 0, /* tp_descr_get */
14223 0, /* tp_descr_set */
14224 0, /* tp_dictoffset */
14225 0, /* tp_init */
14226 0, /* tp_alloc */
14227 unicode_new, /* tp_new */
14228 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014229};
14230
14231/* Initialize the Unicode implementation */
14232
Victor Stinner3a50e702011-10-18 21:21:00 +020014233int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014234{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014235 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014236 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014237 0x000A, /* LINE FEED */
14238 0x000D, /* CARRIAGE RETURN */
14239 0x001C, /* FILE SEPARATOR */
14240 0x001D, /* GROUP SEPARATOR */
14241 0x001E, /* RECORD SEPARATOR */
14242 0x0085, /* NEXT LINE */
14243 0x2028, /* LINE SEPARATOR */
14244 0x2029, /* PARAGRAPH SEPARATOR */
14245 };
14246
Fred Drakee4315f52000-05-09 19:53:39 +000014247 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014248 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014249 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014250 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014251 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014252
Guido van Rossumcacfc072002-05-24 19:01:59 +000014253 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014254 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014255
14256 /* initialize the linebreak bloom filter */
14257 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014258 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014259 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014260
14261 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014262
Benjamin Petersonc4311282012-10-30 23:21:10 -040014263 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14264 Py_FatalError("Can't initialize field name iterator type");
14265
14266 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14267 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014268
Victor Stinner3a50e702011-10-18 21:21:00 +020014269#ifdef HAVE_MBCS
14270 winver.dwOSVersionInfoSize = sizeof(winver);
14271 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14272 PyErr_SetFromWindowsErr(0);
14273 return -1;
14274 }
14275#endif
14276 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014277}
14278
14279/* Finalize the Unicode implementation */
14280
/* This implementation does no work here: the function always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14286
Guido van Rossumd57fd912000-03-10 22:53:23 +000014287void
Thomas Wouters78890102000-07-22 19:25:51 +000014288_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014289{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014290 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014291
Serhiy Storchaka05997252013-01-26 12:14:02 +020014292 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014293
Serhiy Storchaka05997252013-01-26 12:14:02 +020014294 for (i = 0; i < 256; i++)
14295 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014296 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014297 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014298}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014299
Walter Dörwald16807132007-05-25 13:52:07 +000014300void
14301PyUnicode_InternInPlace(PyObject **p)
14302{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014303 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014304 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014305#ifdef Py_DEBUG
14306 assert(s != NULL);
14307 assert(_PyUnicode_CHECK(s));
14308#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014309 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014310 return;
14311#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014312 /* If it's a subclass, we don't really know what putting
14313 it in the interned dict might do. */
14314 if (!PyUnicode_CheckExact(s))
14315 return;
14316 if (PyUnicode_CHECK_INTERNED(s))
14317 return;
14318 if (interned == NULL) {
14319 interned = PyDict_New();
14320 if (interned == NULL) {
14321 PyErr_Clear(); /* Don't leave an exception */
14322 return;
14323 }
14324 }
14325 /* It might be that the GetItem call fails even
14326 though the key is present in the dictionary,
14327 namely when this happens during a stack overflow. */
14328 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014329 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014330 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014331
Benjamin Peterson29060642009-01-31 22:14:21 +000014332 if (t) {
14333 Py_INCREF(t);
14334 Py_DECREF(*p);
14335 *p = t;
14336 return;
14337 }
Walter Dörwald16807132007-05-25 13:52:07 +000014338
Benjamin Peterson14339b62009-01-31 16:36:08 +000014339 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014340 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014341 PyErr_Clear();
14342 PyThreadState_GET()->recursion_critical = 0;
14343 return;
14344 }
14345 PyThreadState_GET()->recursion_critical = 0;
14346 /* The two references in interned are not counted by refcnt.
14347 The deallocator will take care of this */
14348 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014349 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014350}
14351
14352void
14353PyUnicode_InternImmortal(PyObject **p)
14354{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014355 PyUnicode_InternInPlace(p);
14356 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014357 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014358 Py_INCREF(*p);
14359 }
Walter Dörwald16807132007-05-25 13:52:07 +000014360}
14361
14362PyObject *
14363PyUnicode_InternFromString(const char *cp)
14364{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014365 PyObject *s = PyUnicode_FromString(cp);
14366 if (s == NULL)
14367 return NULL;
14368 PyUnicode_InternInPlace(&s);
14369 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014370}
14371
Alexander Belopolsky40018472011-02-26 01:02:56 +000014372void
14373_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014374{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014375 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014376 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014377 Py_ssize_t i, n;
14378 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014379
Benjamin Peterson14339b62009-01-31 16:36:08 +000014380 if (interned == NULL || !PyDict_Check(interned))
14381 return;
14382 keys = PyDict_Keys(interned);
14383 if (keys == NULL || !PyList_Check(keys)) {
14384 PyErr_Clear();
14385 return;
14386 }
Walter Dörwald16807132007-05-25 13:52:07 +000014387
Benjamin Peterson14339b62009-01-31 16:36:08 +000014388 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14389 detector, interned unicode strings are not forcibly deallocated;
14390 rather, we give them their stolen references back, and then clear
14391 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014392
Benjamin Peterson14339b62009-01-31 16:36:08 +000014393 n = PyList_GET_SIZE(keys);
14394 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014395 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014396 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014397 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014398 if (PyUnicode_READY(s) == -1) {
14399 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014400 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014401 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014402 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014403 case SSTATE_NOT_INTERNED:
14404 /* XXX Shouldn't happen */
14405 break;
14406 case SSTATE_INTERNED_IMMORTAL:
14407 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014408 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014409 break;
14410 case SSTATE_INTERNED_MORTAL:
14411 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014412 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014413 break;
14414 default:
14415 Py_FatalError("Inconsistent interned string state.");
14416 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014417 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014418 }
14419 fprintf(stderr, "total size of all interned strings: "
14420 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14421 "mortal/immortal\n", mortal_size, immortal_size);
14422 Py_DECREF(keys);
14423 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014424 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014425}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014426
14427
14428/********************* Unicode Iterator **************************/
14429
14430typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014431 PyObject_HEAD
14432 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014433 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014434} unicodeiterobject;
14435
14436static void
14437unicodeiter_dealloc(unicodeiterobject *it)
14438{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014439 _PyObject_GC_UNTRACK(it);
14440 Py_XDECREF(it->it_seq);
14441 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014442}
14443
14444static int
14445unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14446{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014447 Py_VISIT(it->it_seq);
14448 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014449}
14450
14451static PyObject *
14452unicodeiter_next(unicodeiterobject *it)
14453{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014454 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014455
Benjamin Peterson14339b62009-01-31 16:36:08 +000014456 assert(it != NULL);
14457 seq = it->it_seq;
14458 if (seq == NULL)
14459 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014460 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014462 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14463 int kind = PyUnicode_KIND(seq);
14464 void *data = PyUnicode_DATA(seq);
14465 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14466 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014467 if (item != NULL)
14468 ++it->it_index;
14469 return item;
14470 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014471
Benjamin Peterson14339b62009-01-31 16:36:08 +000014472 Py_DECREF(seq);
14473 it->it_seq = NULL;
14474 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014475}
14476
14477static PyObject *
14478unicodeiter_len(unicodeiterobject *it)
14479{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014480 Py_ssize_t len = 0;
14481 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014482 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014483 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014484}
14485
14486PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14487
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014488static PyObject *
14489unicodeiter_reduce(unicodeiterobject *it)
14490{
14491 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014492 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014493 it->it_seq, it->it_index);
14494 } else {
14495 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14496 if (u == NULL)
14497 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014498 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014499 }
14500}
14501
14502PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14503
14504static PyObject *
14505unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14506{
14507 Py_ssize_t index = PyLong_AsSsize_t(state);
14508 if (index == -1 && PyErr_Occurred())
14509 return NULL;
14510 if (index < 0)
14511 index = 0;
14512 it->it_index = index;
14513 Py_RETURN_NONE;
14514}
14515
14516PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14517
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014518static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014519 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014520 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014521 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14522 reduce_doc},
14523 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14524 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014525 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014526};
14527
14528PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014529 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14530 "str_iterator", /* tp_name */
14531 sizeof(unicodeiterobject), /* tp_basicsize */
14532 0, /* tp_itemsize */
14533 /* methods */
14534 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14535 0, /* tp_print */
14536 0, /* tp_getattr */
14537 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014538 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014539 0, /* tp_repr */
14540 0, /* tp_as_number */
14541 0, /* tp_as_sequence */
14542 0, /* tp_as_mapping */
14543 0, /* tp_hash */
14544 0, /* tp_call */
14545 0, /* tp_str */
14546 PyObject_GenericGetAttr, /* tp_getattro */
14547 0, /* tp_setattro */
14548 0, /* tp_as_buffer */
14549 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14550 0, /* tp_doc */
14551 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14552 0, /* tp_clear */
14553 0, /* tp_richcompare */
14554 0, /* tp_weaklistoffset */
14555 PyObject_SelfIter, /* tp_iter */
14556 (iternextfunc)unicodeiter_next, /* tp_iternext */
14557 unicodeiter_methods, /* tp_methods */
14558 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014559};
14560
14561static PyObject *
14562unicode_iter(PyObject *seq)
14563{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014564 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014565
Benjamin Peterson14339b62009-01-31 16:36:08 +000014566 if (!PyUnicode_Check(seq)) {
14567 PyErr_BadInternalCall();
14568 return NULL;
14569 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014570 if (PyUnicode_READY(seq) == -1)
14571 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014572 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14573 if (it == NULL)
14574 return NULL;
14575 it->it_index = 0;
14576 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014577 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014578 _PyObject_GC_TRACK(it);
14579 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014580}
14581
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014582
14583size_t
14584Py_UNICODE_strlen(const Py_UNICODE *u)
14585{
14586 int res = 0;
14587 while(*u++)
14588 res++;
14589 return res;
14590}
14591
14592Py_UNICODE*
14593Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14594{
14595 Py_UNICODE *u = s1;
14596 while ((*u++ = *s2++));
14597 return s1;
14598}
14599
14600Py_UNICODE*
14601Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14602{
14603 Py_UNICODE *u = s1;
14604 while ((*u++ = *s2++))
14605 if (n-- == 0)
14606 break;
14607 return s1;
14608}
14609
14610Py_UNICODE*
14611Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14612{
14613 Py_UNICODE *u1 = s1;
14614 u1 += Py_UNICODE_strlen(u1);
14615 Py_UNICODE_strcpy(u1, s2);
14616 return s1;
14617}
14618
14619int
14620Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14621{
14622 while (*s1 && *s2 && *s1 == *s2)
14623 s1++, s2++;
14624 if (*s1 && *s2)
14625 return (*s1 < *s2) ? -1 : +1;
14626 if (*s1)
14627 return 1;
14628 if (*s2)
14629 return -1;
14630 return 0;
14631}
14632
14633int
14634Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14635{
14636 register Py_UNICODE u1, u2;
14637 for (; n != 0; n--) {
14638 u1 = *s1;
14639 u2 = *s2;
14640 if (u1 != u2)
14641 return (u1 < u2) ? -1 : +1;
14642 if (u1 == '\0')
14643 return 0;
14644 s1++;
14645 s2++;
14646 }
14647 return 0;
14648}
14649
14650Py_UNICODE*
14651Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14652{
14653 const Py_UNICODE *p;
14654 for (p = s; *p; p++)
14655 if (*p == c)
14656 return (Py_UNICODE*)p;
14657 return NULL;
14658}
14659
14660Py_UNICODE*
14661Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14662{
14663 const Py_UNICODE *p;
14664 p = s + Py_UNICODE_strlen(s);
14665 while (p != s) {
14666 p--;
14667 if (*p == c)
14668 return (Py_UNICODE*)p;
14669 }
14670 return NULL;
14671}
Victor Stinner331ea922010-08-10 16:37:20 +000014672
Victor Stinner71133ff2010-09-01 23:43:53 +000014673Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014674PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014675{
Victor Stinner577db2c2011-10-11 22:12:48 +020014676 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014677 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014678
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014679 if (!PyUnicode_Check(unicode)) {
14680 PyErr_BadArgument();
14681 return NULL;
14682 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014683 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014684 if (u == NULL)
14685 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014686 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014687 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014688 PyErr_NoMemory();
14689 return NULL;
14690 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014691 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014692 size *= sizeof(Py_UNICODE);
14693 copy = PyMem_Malloc(size);
14694 if (copy == NULL) {
14695 PyErr_NoMemory();
14696 return NULL;
14697 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014698 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014699 return copy;
14700}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014701
Georg Brandl66c221e2010-10-14 07:04:07 +000014702/* A _string module, to export formatter_parser and formatter_field_name_split
14703 to the string.Formatter class implemented in Python. */
14704
14705static PyMethodDef _string_methods[] = {
14706 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14707 METH_O, PyDoc_STR("split the argument as a field name")},
14708 {"formatter_parser", (PyCFunction) formatter_parser,
14709 METH_O, PyDoc_STR("parse the argument as a format string")},
14710 {NULL, NULL}
14711};
14712
14713static struct PyModuleDef _string_module = {
14714 PyModuleDef_HEAD_INIT,
14715 "_string",
14716 PyDoc_STR("string helper module"),
14717 0,
14718 _string_methods,
14719 NULL,
14720 NULL,
14721 NULL,
14722 NULL
14723};
14724
14725PyMODINIT_FUNC
14726PyInit__string(void)
14727{
14728 return PyModule_Create(&_string_module);
14729}
14730
14731
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014732#ifdef __cplusplus
14733}
14734#endif