blob: 7f079e789972aa9d47561c52d4db0d76d6477cd5 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Byte-order selection: big endian when the build defines
   WORDS_BIGENDIAN, little endian in every other case. */

#if defined(WORDS_BIGENDIAN)
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
56
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Object check used inside the assertions of the macros below.  Debug
   builds call _PyUnicode_CheckConsistency() (without the content check);
   other builds only call PyUnicode_Check(). */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020074
/* Raw field accessors: plain casts to the structure that declares the
   field, with no assertion.  Each one expands to the member itself. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)

/* Checked accessors: assert that `op` passes _PyUnicode_CHECK() before
   reading the field. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 pointer and length of a ready string.  For a compact ASCII
   string the pointer is the memory directly following the PyASCIIObject
   header and the length is the string length; otherwise the utf8 and
   utf8_length fields are used. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200109
/* File-local replacement for PyUnicode_READY: asserts the object check,
   then evaluates to 0 when the string is already ready and to the result
   of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200116
/* True if the utf8 pointer of `op` is the same address as its character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the same address as its character
   data.  The comparison uses the macro argument on both sides; it used to
   read a variable literally named `unicode` from the expansion site, which
   only worked when the caller's variable happened to have that name. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
124
/* True when the object has its own UTF-8 buffer: it is not a compact
   ASCII string, the utf8 pointer is set, and that pointer is not the
   character data itself (i.e. the buffer is not shared). */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
132
/* True when the object has its own wstr buffer: the wstr pointer is set
   and either the string is not ready yet or the pointer differs from the
   character data (i.e. the buffer is not shared). */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
140
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *";
   the half-open range [begin, end) is converted.  to points to the buffer
   the converted characters are written to; it is cast to "to_type *" as a
   whole expression, so an argument such as `buf + offset` is offset in the
   units of buf's own type before the cast is applied.

   Characters are copied four at a time, then one at a time for the
   remainder.  All temporaries carry a leading underscore so that they
   cannot shadow a variable at the expansion site. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
/* To be used after the characters of a string were changed: stores -1 in
   the hash field, which resets the cached hash. */
#define _PyUnicode_DIRTY(op) \
    do { \
        _PyUnicode_HASH(op) = -1; \
    } while (0)
167
Walter Dörwald16807132007-05-25 13:52:07 +0000168/* This dictionary holds all interned unicode strings. Note that references
169 to strings in this dictionary are *not* counted in the string's ob_refcnt.
170 When the interned string reaches a refcnt of 0 the string deallocation
171 function will delete the reference from this dictionary.
172
173 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000174 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000175*/
176static PyObject *interned;
177
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000178/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200179static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200181/* List of static strings. */
182static _Py_Identifier *static_strings;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* Single character Unicode strings in the Latin-1 range are being
185 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200186static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187
/* Fast detection of the most frequent whitespace characters.

   128-entry lookup table indexed by character code; an entry is 1 for:
     0x0009 CHARACTER TABULATION    0x001C FILE SEPARATOR
     0x000A LINE FEED               0x001D GROUP SEPARATOR
     0x000B LINE TABULATION         0x001E RECORD SEPARATOR
     0x000C FORM FEED               0x001F UNIT SEPARATOR
     0x000D CARRIAGE RETURN         0x0020 SPACE
   and 0 for every other code.  One row below covers 16 codes. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
218
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200220static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200221static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200222static void copy_characters(
223 PyObject *to, Py_ssize_t to_start,
224 PyObject *from, Py_ssize_t from_start,
225 Py_ssize_t how_many);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226
Alexander Belopolsky40018472011-02-26 01:02:56 +0000227static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200228unicode_fromascii(const unsigned char *s, Py_ssize_t size);
229static PyObject *
230_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
231static PyObject *
232_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
235
236static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000237unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000238 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100239 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
241
Alexander Belopolsky40018472011-02-26 01:02:56 +0000242static void
243raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300244 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100245 PyObject *unicode,
246 Py_ssize_t startpos, Py_ssize_t endpos,
247 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000248
/* Same kind of table for line breaks.

   128-entry lookup table indexed by character code; an entry is 1 for:
     0x000A LINE FEED               0x001C FILE SEPARATOR
     0x000B LINE TABULATION         0x001D GROUP SEPARATOR
     0x000C FORM FEED               0x001E RECORD SEPARATOR
     0x000D CARRIAGE RETURN
   and 0 for every other code.  One row below covers 16 codes. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
276
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300277/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
278 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000279Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000280PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000282#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000283 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 /* This is actually an illegal character, so it should
286 not be passed to unichr. */
287 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000288#endif
289}
290
#ifdef Py_DEBUG
/* Debug-build check of the internal invariants of a unicode object.

   op:            object to check; must pass PyUnicode_Check().
   check_content: if non-zero (and the string is not of the wchar kind),
                  also scan the characters and check that the storage kind
                  matches the largest character.

   Every violated invariant trips an assert(); a character above 0x10FFFF
   dumps the string to stdout and calls abort().  The function always
   returns 1, so it can itself be used as the argument of assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: the characters are stored right
               after the PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* non-compact string: the characters are reached through
               the data pointer */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready: only the wstr buffer exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII: utf8 shares the character data */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the character data exactly when the kind has
               the same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (maxchar > 0x10FFFF) {
            printf("Invalid Unicode string! {");
            for (i=0; i < ascii->length; i++)
            {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                /* the casts make the arguments match the conversion
                   specifiers whatever the widths of Py_UCS4 and
                   Py_ssize_t are on this platform */
                if (i)
                    printf(", U+%04x", (unsigned int)ch);
                else
                    printf("U+%04x", (unsigned int)ch);
            }
            printf("} (len=%lu)\n", (unsigned long)ascii->length);
            abort();
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= 0x10FFFF);
        }
    }
    return 1;
}
#endif
415
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100416static PyObject*
417unicode_result_wchar(PyObject *unicode)
418{
419#ifndef Py_DEBUG
420 Py_ssize_t len;
421
422 assert(Py_REFCNT(unicode) == 1);
423
424 len = _PyUnicode_WSTR_LENGTH(unicode);
425 if (len == 0) {
426 Py_INCREF(unicode_empty);
427 Py_DECREF(unicode);
428 return unicode_empty;
429 }
430
431 if (len == 1) {
432 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
433 if (ch < 256) {
434 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
435 Py_DECREF(unicode);
436 return latin1_char;
437 }
438 }
439
440 if (_PyUnicode_Ready(unicode) < 0) {
441 Py_XDECREF(unicode);
442 return NULL;
443 }
444#else
445 /* don't make the result ready in debug mode to ensure that the caller
446 makes the string ready before using it */
447 assert(_PyUnicode_CheckConsistency(unicode, 1));
448#endif
449 return unicode;
450}
451
452static PyObject*
453unicode_result_ready(PyObject *unicode)
454{
455 Py_ssize_t length;
456
457 length = PyUnicode_GET_LENGTH(unicode);
458 if (length == 0) {
459 if (unicode != unicode_empty) {
460 Py_INCREF(unicode_empty);
461 Py_DECREF(unicode);
462 }
463 return unicode_empty;
464 }
465
466 if (length == 1) {
467 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
468 if (ch < 256) {
469 PyObject *latin1_char = unicode_latin1[ch];
470 if (latin1_char != NULL) {
471 if (unicode != latin1_char) {
472 Py_INCREF(latin1_char);
473 Py_DECREF(unicode);
474 }
475 return latin1_char;
476 }
477 else {
478 assert(_PyUnicode_CheckConsistency(unicode, 1));
479 Py_INCREF(unicode);
480 unicode_latin1[ch] = unicode;
481 return unicode;
482 }
483 }
484 }
485
486 assert(_PyUnicode_CheckConsistency(unicode, 1));
487 return unicode;
488}
489
490static PyObject*
491unicode_result(PyObject *unicode)
492{
493 assert(_PyUnicode_CHECK(unicode));
494 if (PyUnicode_IS_READY(unicode))
495 return unicode_result_ready(unicode);
496 else
497 return unicode_result_wchar(unicode);
498}
499
#if defined(HAVE_MBCS)
/* Windows version information; only present in builds with HAVE_MBCS. */
static OSVERSIONINFOEX winver;
#endif
503
Thomas Wouters477c8d52006-05-27 19:21:47 +0000504/* --- Bloom Filters ----------------------------------------------------- */
505
506/* stuff to implement simple "bloom filters" for Unicode characters.
507 to keep things simple, we use a single bitmask, using the least 5
508 bits from each unicode characters as the bit index. */
509
510/* the linebreak mask is set up by Unicode_Init below */
511
Antoine Pitrouf068f942010-01-13 14:19:12 +0000512#if LONG_BIT >= 128
513#define BLOOM_WIDTH 128
514#elif LONG_BIT >= 64
515#define BLOOM_WIDTH 64
516#elif LONG_BIT >= 32
517#define BLOOM_WIDTH 32
518#else
519#error "LONG_BIT is smaller than 32"
520#endif
521
Thomas Wouters477c8d52006-05-27 19:21:47 +0000522#define BLOOM_MASK unsigned long
523
524static BLOOM_MASK bloom_linebreak;
525
Antoine Pitrouf068f942010-01-13 14:19:12 +0000526#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
527#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000528
Benjamin Peterson29060642009-01-31 22:14:21 +0000529#define BLOOM_LINEBREAK(ch) \
530 ((ch) < 128U ? ascii_linebreak[(ch)] : \
531 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000532
Alexander Belopolsky40018472011-02-26 01:02:56 +0000533Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200534make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535{
536 /* calculate simple bloom-style bitmask for a given unicode string */
537
Antoine Pitrouf068f942010-01-13 14:19:12 +0000538 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539 Py_ssize_t i;
540
541 mask = 0;
542 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200543 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544
545 return mask;
546}
547
/* True if `chr` occurs in the string object `str`.  `mask` is consulted
   first, so PyUnicode_FindChar() only runs when the mask bit of `chr`
   is set. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200552/* Compilation of templated routines */
553
554#include "stringlib/asciilib.h"
555#include "stringlib/fastsearch.h"
556#include "stringlib/partition.h"
557#include "stringlib/split.h"
558#include "stringlib/count.h"
559#include "stringlib/find.h"
560#include "stringlib/find_max_char.h"
561#include "stringlib/localeutil.h"
562#include "stringlib/undef.h"
563
564#include "stringlib/ucs1lib.h"
565#include "stringlib/fastsearch.h"
566#include "stringlib/partition.h"
567#include "stringlib/split.h"
568#include "stringlib/count.h"
569#include "stringlib/find.h"
570#include "stringlib/find_max_char.h"
571#include "stringlib/localeutil.h"
572#include "stringlib/undef.h"
573
574#include "stringlib/ucs2lib.h"
575#include "stringlib/fastsearch.h"
576#include "stringlib/partition.h"
577#include "stringlib/split.h"
578#include "stringlib/count.h"
579#include "stringlib/find.h"
580#include "stringlib/find_max_char.h"
581#include "stringlib/localeutil.h"
582#include "stringlib/undef.h"
583
584#include "stringlib/ucs4lib.h"
585#include "stringlib/fastsearch.h"
586#include "stringlib/partition.h"
587#include "stringlib/split.h"
588#include "stringlib/count.h"
589#include "stringlib/find.h"
590#include "stringlib/find_max_char.h"
591#include "stringlib/localeutil.h"
592#include "stringlib/undef.h"
593
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200594#include "stringlib/unicodedefs.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100598#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200599
Guido van Rossumd57fd912000-03-10 22:53:23 +0000600/* --- Unicode Object ----------------------------------------------------- */
601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200602static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200603fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200604
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200605Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
606 Py_ssize_t size, Py_UCS4 ch,
607 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200608{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200609 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
610
611 switch (kind) {
612 case PyUnicode_1BYTE_KIND:
613 {
614 Py_UCS1 ch1 = (Py_UCS1) ch;
615 if (ch1 == ch)
616 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
617 else
618 return -1;
619 }
620 case PyUnicode_2BYTE_KIND:
621 {
622 Py_UCS2 ch2 = (Py_UCS2) ch;
623 if (ch2 == ch)
624 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
625 else
626 return -1;
627 }
628 case PyUnicode_4BYTE_KIND:
629 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
630 default:
631 assert(0);
632 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200633 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200634}
635
Victor Stinnerfe226c02011-10-03 03:52:20 +0200636static PyObject*
637resize_compact(PyObject *unicode, Py_ssize_t length)
638{
639 Py_ssize_t char_size;
640 Py_ssize_t struct_size;
641 Py_ssize_t new_size;
642 int share_wstr;
643
644 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200645 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200646 if (PyUnicode_IS_COMPACT_ASCII(unicode))
647 struct_size = sizeof(PyASCIIObject);
648 else
649 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200650 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200651
652 _Py_DEC_REFTOTAL;
653 _Py_ForgetReference(unicode);
654
655 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
656 PyErr_NoMemory();
657 return NULL;
658 }
659 new_size = (struct_size + (length + 1) * char_size);
660
661 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
662 if (unicode == NULL) {
663 PyObject_Del(unicode);
664 PyErr_NoMemory();
665 return NULL;
666 }
667 _Py_NewReference(unicode);
668 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200669 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200670 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200671 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
672 _PyUnicode_WSTR_LENGTH(unicode) = length;
673 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
675 length, 0);
676 return unicode;
677}
678
Alexander Belopolsky40018472011-02-26 01:02:56 +0000679static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200680resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000681{
Victor Stinner95663112011-10-04 01:03:50 +0200682 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200683 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200684 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000685
Victor Stinner95663112011-10-04 01:03:50 +0200686 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200687
688 if (PyUnicode_IS_READY(unicode)) {
689 Py_ssize_t char_size;
690 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200691 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 void *data;
693
694 data = _PyUnicode_DATA_ANY(unicode);
695 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200696 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200697 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
698 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200699 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
700 {
701 PyObject_DEL(_PyUnicode_UTF8(unicode));
702 _PyUnicode_UTF8(unicode) = NULL;
703 _PyUnicode_UTF8_LENGTH(unicode) = 0;
704 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200705
706 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
707 PyErr_NoMemory();
708 return -1;
709 }
710 new_size = (length + 1) * char_size;
711
712 data = (PyObject *)PyObject_REALLOC(data, new_size);
713 if (data == NULL) {
714 PyErr_NoMemory();
715 return -1;
716 }
717 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200718 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200720 _PyUnicode_WSTR_LENGTH(unicode) = length;
721 }
722 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200723 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200724 _PyUnicode_UTF8_LENGTH(unicode) = length;
725 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200726 _PyUnicode_LENGTH(unicode) = length;
727 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200728 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200729 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200730 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200731 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 }
Victor Stinner95663112011-10-04 01:03:50 +0200733 assert(_PyUnicode_WSTR(unicode) != NULL);
734
735 /* check for integer overflow */
736 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
737 PyErr_NoMemory();
738 return -1;
739 }
740 wstr = _PyUnicode_WSTR(unicode);
741 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
742 if (!wstr) {
743 PyErr_NoMemory();
744 return -1;
745 }
746 _PyUnicode_WSTR(unicode) = wstr;
747 _PyUnicode_WSTR(unicode)[length] = 0;
748 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200749 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000750 return 0;
751}
752
Victor Stinnerfe226c02011-10-03 03:52:20 +0200753static PyObject*
754resize_copy(PyObject *unicode, Py_ssize_t length)
755{
756 Py_ssize_t copy_length;
757 if (PyUnicode_IS_COMPACT(unicode)) {
758 PyObject *copy;
759 assert(PyUnicode_IS_READY(unicode));
760
761 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
762 if (copy == NULL)
763 return NULL;
764
765 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200766 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200767 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200768 }
769 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200770 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771 assert(_PyUnicode_WSTR(unicode) != NULL);
772 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200773 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774 if (w == NULL)
775 return NULL;
776 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
777 copy_length = Py_MIN(copy_length, length);
778 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
779 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200780 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200781 }
782}
783
Guido van Rossumd57fd912000-03-10 22:53:23 +0000784/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000785 Ux0000 terminated; some code (e.g. new_identifier)
786 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000787
788 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000789 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000790
791*/
792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200793#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200794static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795#endif
796
Alexander Belopolsky40018472011-02-26 01:02:56 +0000797static PyUnicodeObject *
798_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000799{
800 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000802
Thomas Wouters477c8d52006-05-27 19:21:47 +0000803 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000804 if (length == 0 && unicode_empty != NULL) {
805 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200806 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807 }
808
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000809 /* Ensure we won't overflow the size. */
810 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
811 return (PyUnicodeObject *)PyErr_NoMemory();
812 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200813 if (length < 0) {
814 PyErr_SetString(PyExc_SystemError,
815 "Negative size passed to _PyUnicode_New");
816 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000817 }
818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200819#ifdef Py_DEBUG
820 ++unicode_old_new_calls;
821#endif
822
823 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
824 if (unicode == NULL)
825 return NULL;
826 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
827 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
828 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000829 PyErr_NoMemory();
830 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200832
Jeremy Hyltond8082792003-09-16 19:41:39 +0000833 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000834 * the caller fails before initializing str -- unicode_resize()
835 * reads str[0], and the Keep-Alive optimization can keep memory
836 * allocated for str alive across a call to unicode_dealloc(unicode).
837 * We don't want unicode_resize to read uninitialized memory in
838 * that case.
839 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840 _PyUnicode_WSTR(unicode)[0] = 0;
841 _PyUnicode_WSTR(unicode)[length] = 0;
842 _PyUnicode_WSTR_LENGTH(unicode) = length;
843 _PyUnicode_HASH(unicode) = -1;
844 _PyUnicode_STATE(unicode).interned = 0;
845 _PyUnicode_STATE(unicode).kind = 0;
846 _PyUnicode_STATE(unicode).compact = 0;
847 _PyUnicode_STATE(unicode).ready = 0;
848 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200849 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200850 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200851 _PyUnicode_UTF8(unicode) = NULL;
852 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100853 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000854 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000855
Benjamin Peterson29060642009-01-31 22:14:21 +0000856 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000857 /* XXX UNREF/NEWREF interface should be more symmetrical */
858 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000859 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000860 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000861 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862}
863
Victor Stinnerf42dc442011-10-02 23:33:16 +0200864static const char*
865unicode_kind_name(PyObject *unicode)
866{
Victor Stinner42dfd712011-10-03 14:41:45 +0200867 /* don't check consistency: unicode_kind_name() is called from
868 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200869 if (!PyUnicode_IS_COMPACT(unicode))
870 {
871 if (!PyUnicode_IS_READY(unicode))
872 return "wstr";
873 switch(PyUnicode_KIND(unicode))
874 {
875 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200876 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200877 return "legacy ascii";
878 else
879 return "legacy latin1";
880 case PyUnicode_2BYTE_KIND:
881 return "legacy UCS2";
882 case PyUnicode_4BYTE_KIND:
883 return "legacy UCS4";
884 default:
885 return "<legacy invalid kind>";
886 }
887 }
888 assert(PyUnicode_IS_READY(unicode));
889 switch(PyUnicode_KIND(unicode))
890 {
891 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200892 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200893 return "ascii";
894 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200895 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200896 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200897 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200898 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200899 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200900 default:
901 return "<invalid compact kind>";
902 }
903}
904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200906static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200907
908/* Functions wrapping macros for use in debugger */
909char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200910 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200911}
912
913void *_PyUnicode_compact_data(void *unicode) {
914 return _PyUnicode_COMPACT_DATA(unicode);
915}
916void *_PyUnicode_data(void *unicode){
917 printf("obj %p\n", unicode);
918 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
919 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
920 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
921 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
922 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
923 return PyUnicode_DATA(unicode);
924}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200925
926void
927_PyUnicode_Dump(PyObject *op)
928{
929 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200930 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
931 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
932 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200933
Victor Stinnera849a4b2011-10-03 12:12:11 +0200934 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200935 {
936 if (ascii->state.ascii)
937 data = (ascii + 1);
938 else
939 data = (compact + 1);
940 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200941 else
942 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200943 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
944
Victor Stinnera849a4b2011-10-03 12:12:11 +0200945 if (ascii->wstr == data)
946 printf("shared ");
947 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200948
Victor Stinnera3b334d2011-10-03 13:53:37 +0200949 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200950 printf(" (%zu), ", compact->wstr_length);
951 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
952 printf("shared ");
953 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200954 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200955 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200956}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200957#endif
958
959PyObject *
960PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
961{
962 PyObject *obj;
963 PyCompactUnicodeObject *unicode;
964 void *data;
965 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200966 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 Py_ssize_t char_size;
968 Py_ssize_t struct_size;
969
970 /* Optimization for empty strings */
971 if (size == 0 && unicode_empty != NULL) {
972 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200973 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974 }
975
976#ifdef Py_DEBUG
977 ++unicode_new_new_calls;
978#endif
979
Victor Stinner9e9d6892011-10-04 01:02:02 +0200980 is_ascii = 0;
981 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982 struct_size = sizeof(PyCompactUnicodeObject);
983 if (maxchar < 128) {
984 kind_state = PyUnicode_1BYTE_KIND;
985 char_size = 1;
986 is_ascii = 1;
987 struct_size = sizeof(PyASCIIObject);
988 }
989 else if (maxchar < 256) {
990 kind_state = PyUnicode_1BYTE_KIND;
991 char_size = 1;
992 }
993 else if (maxchar < 65536) {
994 kind_state = PyUnicode_2BYTE_KIND;
995 char_size = 2;
996 if (sizeof(wchar_t) == 2)
997 is_sharing = 1;
998 }
999 else {
1000 kind_state = PyUnicode_4BYTE_KIND;
1001 char_size = 4;
1002 if (sizeof(wchar_t) == 4)
1003 is_sharing = 1;
1004 }
1005
1006 /* Ensure we won't overflow the size. */
1007 if (size < 0) {
1008 PyErr_SetString(PyExc_SystemError,
1009 "Negative size passed to PyUnicode_New");
1010 return NULL;
1011 }
1012 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1013 return PyErr_NoMemory();
1014
1015 /* Duplicated allocation code from _PyObject_New() instead of a call to
1016 * PyObject_New() so we are able to allocate space for the object and
1017 * it's data buffer.
1018 */
1019 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1020 if (obj == NULL)
1021 return PyErr_NoMemory();
1022 obj = PyObject_INIT(obj, &PyUnicode_Type);
1023 if (obj == NULL)
1024 return NULL;
1025
1026 unicode = (PyCompactUnicodeObject *)obj;
1027 if (is_ascii)
1028 data = ((PyASCIIObject*)obj) + 1;
1029 else
1030 data = unicode + 1;
1031 _PyUnicode_LENGTH(unicode) = size;
1032 _PyUnicode_HASH(unicode) = -1;
1033 _PyUnicode_STATE(unicode).interned = 0;
1034 _PyUnicode_STATE(unicode).kind = kind_state;
1035 _PyUnicode_STATE(unicode).compact = 1;
1036 _PyUnicode_STATE(unicode).ready = 1;
1037 _PyUnicode_STATE(unicode).ascii = is_ascii;
1038 if (is_ascii) {
1039 ((char*)data)[size] = 0;
1040 _PyUnicode_WSTR(unicode) = NULL;
1041 }
1042 else if (kind_state == PyUnicode_1BYTE_KIND) {
1043 ((char*)data)[size] = 0;
1044 _PyUnicode_WSTR(unicode) = NULL;
1045 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001046 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001047 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 }
1049 else {
1050 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001051 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 if (kind_state == PyUnicode_2BYTE_KIND)
1053 ((Py_UCS2*)data)[size] = 0;
1054 else /* kind_state == PyUnicode_4BYTE_KIND */
1055 ((Py_UCS4*)data)[size] = 0;
1056 if (is_sharing) {
1057 _PyUnicode_WSTR_LENGTH(unicode) = size;
1058 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1059 }
1060 else {
1061 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1062 _PyUnicode_WSTR(unicode) = NULL;
1063 }
1064 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001065 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001066 return obj;
1067}
1068
#if SIZEOF_WCHAR_T == 2
/* Helper function converting a 16-bit wchar_t sequence [begin, end) into the
   UCS4 buffer of `unicode`.  A high surrogate immediately followed by a low
   surrogate is combined into one code point; every other unit, including a
   lone surrogate, is copied unchanged.  The other conversions are
   implemented as macros for efficiency.

   `unicode` must already be a 4-byte kind string whose length equals the
   number of code points produced (checked with assertions only).

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src + 1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* Surrogate pair: 10 bits from each half, offset by 0x10000. */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* Exactly the announced number of code points must have been written. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1107
Victor Stinnercd9950f2011-10-02 00:34:53 +02001108static int
1109_PyUnicode_Dirty(PyObject *unicode)
1110{
Victor Stinner910337b2011-10-03 03:20:16 +02001111 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001112 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001113 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001114 "Cannot modify a string having more than 1 reference");
1115 return -1;
1116 }
1117 _PyUnicode_DIRTY(unicode);
1118 return 0;
1119}
1120
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001121static int
1122_copy_characters(PyObject *to, Py_ssize_t to_start,
1123 PyObject *from, Py_ssize_t from_start,
1124 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 unsigned int from_kind, to_kind;
1127 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001128 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 assert(PyUnicode_Check(from));
1131 assert(PyUnicode_Check(to));
1132 assert(PyUnicode_IS_READY(from));
1133 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1136 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1137 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001139 if (how_many == 0)
1140 return 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001143 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001145 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147#ifdef Py_DEBUG
1148 if (!check_maxchar
1149 && (from_kind > to_kind
1150 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1153 Py_UCS4 ch;
1154 Py_ssize_t i;
1155 for (i=0; i < how_many; i++) {
1156 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1157 assert(ch <= to_maxchar);
1158 }
1159 }
1160#endif
1161 fast = (from_kind == to_kind);
1162 if (check_maxchar
1163 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1164 {
1165 /* deny latin1 => ascii */
1166 fast = 0;
1167 }
1168
1169 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001170 Py_MEMCPY((char*)to_data + to_kind * to_start,
1171 (char*)from_data + from_kind * from_start,
1172 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001174 else if (from_kind == PyUnicode_1BYTE_KIND
1175 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001176 {
1177 _PyUnicode_CONVERT_BYTES(
1178 Py_UCS1, Py_UCS2,
1179 PyUnicode_1BYTE_DATA(from) + from_start,
1180 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1181 PyUnicode_2BYTE_DATA(to) + to_start
1182 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001183 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001184 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001185 && to_kind == PyUnicode_4BYTE_KIND)
1186 {
1187 _PyUnicode_CONVERT_BYTES(
1188 Py_UCS1, Py_UCS4,
1189 PyUnicode_1BYTE_DATA(from) + from_start,
1190 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1191 PyUnicode_4BYTE_DATA(to) + to_start
1192 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001193 }
1194 else if (from_kind == PyUnicode_2BYTE_KIND
1195 && to_kind == PyUnicode_4BYTE_KIND)
1196 {
1197 _PyUnicode_CONVERT_BYTES(
1198 Py_UCS2, Py_UCS4,
1199 PyUnicode_2BYTE_DATA(from) + from_start,
1200 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1201 PyUnicode_4BYTE_DATA(to) + to_start
1202 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001204 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001205 /* check if max_char(from substring) <= max_char(to) */
1206 if (from_kind > to_kind
1207 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001208 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001209 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001210 /* slow path to check for character overflow */
1211 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001212 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 Py_ssize_t i;
1214
Victor Stinner56c161a2011-10-06 02:47:11 +02001215#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001216 for (i=0; i < how_many; i++) {
1217 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001218 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1220 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001221#else
1222 if (!check_maxchar) {
1223 for (i=0; i < how_many; i++) {
1224 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1225 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1226 }
1227 }
1228 else {
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 if (ch > to_maxchar)
1232 return 1;
1233 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1234 }
1235 }
1236#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001238 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001239 assert(0 && "inconsistent state");
1240 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001241 }
1242 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001243 return 0;
1244}
1245
1246static void
1247copy_characters(PyObject *to, Py_ssize_t to_start,
1248 PyObject *from, Py_ssize_t from_start,
1249 Py_ssize_t how_many)
1250{
1251 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1252}
1253
1254Py_ssize_t
1255PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1256 PyObject *from, Py_ssize_t from_start,
1257 Py_ssize_t how_many)
1258{
1259 int err;
1260
1261 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1262 PyErr_BadInternalCall();
1263 return -1;
1264 }
1265
1266 if (PyUnicode_READY(from))
1267 return -1;
1268 if (PyUnicode_READY(to))
1269 return -1;
1270
1271 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1272 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1273 PyErr_Format(PyExc_SystemError,
1274 "Cannot write %zi characters at %zi "
1275 "in a string of %zi characters",
1276 how_many, to_start, PyUnicode_GET_LENGTH(to));
1277 return -1;
1278 }
1279
1280 if (how_many == 0)
1281 return 0;
1282
1283 if (_PyUnicode_Dirty(to))
1284 return -1;
1285
1286 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1287 if (err) {
1288 PyErr_Format(PyExc_SystemError,
1289 "Cannot copy %s characters "
1290 "into a string of %s characters",
1291 unicode_kind_name(from),
1292 unicode_kind_name(to));
1293 return -1;
1294 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001295 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296}
1297
Victor Stinner17222162011-09-28 22:15:37 +02001298/* Find the maximum code point and count the number of surrogate pairs so a
1299 correct string length can be computed before converting a string to UCS4.
1300 This function counts single surrogates as a character and not as a pair.
1301
1302 Return 0 on success, or -1 on error. */
1303static int
1304find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1305 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306{
1307 const wchar_t *iter;
1308
Victor Stinnerc53be962011-10-02 21:33:54 +02001309 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 *num_surrogates = 0;
1311 *maxchar = 0;
1312
1313 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001314 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001316#if SIZEOF_WCHAR_T != 2
1317 if (*maxchar >= 0x10000)
1318 return 0;
1319#endif
1320 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321#if SIZEOF_WCHAR_T == 2
1322 if (*iter >= 0xD800 && *iter <= 0xDBFF
1323 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1324 {
1325 Py_UCS4 surrogate_val;
1326 surrogate_val = (((iter[0] & 0x3FF)<<10)
1327 | (iter[1] & 0x3FF)) + 0x10000;
1328 ++(*num_surrogates);
1329 if (surrogate_val > *maxchar)
1330 *maxchar = surrogate_val;
1331 iter += 2;
1332 }
1333 else
1334 iter++;
1335#else
1336 iter++;
1337#endif
1338 }
1339 return 0;
1340}
1341
1342#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001343static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344#endif
1345
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001346int
1347_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001348{
1349 wchar_t *end;
1350 Py_UCS4 maxchar = 0;
1351 Py_ssize_t num_surrogates;
1352#if SIZEOF_WCHAR_T == 2
1353 Py_ssize_t length_wo_surrogates;
1354#endif
1355
Georg Brandl7597add2011-10-05 16:36:47 +02001356 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001357 strings were created using _PyObject_New() and where no canonical
1358 representation (the str field) has been set yet aka strings
1359 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001360 assert(_PyUnicode_CHECK(unicode));
1361 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001362 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001363 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001364 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001365 /* Actually, it should neither be interned nor be anything else: */
1366 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367
1368#ifdef Py_DEBUG
1369 ++unicode_ready_calls;
1370#endif
1371
1372 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001373 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001374 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376
1377 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001378 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1379 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380 PyErr_NoMemory();
1381 return -1;
1382 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001383 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384 _PyUnicode_WSTR(unicode), end,
1385 PyUnicode_1BYTE_DATA(unicode));
1386 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1387 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1388 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1389 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001390 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001391 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001392 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 }
1394 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001395 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001396 _PyUnicode_UTF8(unicode) = NULL;
1397 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398 }
1399 PyObject_FREE(_PyUnicode_WSTR(unicode));
1400 _PyUnicode_WSTR(unicode) = NULL;
1401 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1402 }
1403 /* In this case we might have to convert down from 4-byte native
1404 wchar_t to 2-byte unicode. */
1405 else if (maxchar < 65536) {
1406 assert(num_surrogates == 0 &&
1407 "FindMaxCharAndNumSurrogatePairs() messed up");
1408
Victor Stinner506f5922011-09-28 22:34:18 +02001409#if SIZEOF_WCHAR_T == 2
1410 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001411 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001412 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1413 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1414 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001415 _PyUnicode_UTF8(unicode) = NULL;
1416 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001417#else
1418 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001419 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001420 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001421 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001422 PyErr_NoMemory();
1423 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 }
Victor Stinner506f5922011-09-28 22:34:18 +02001425 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1426 _PyUnicode_WSTR(unicode), end,
1427 PyUnicode_2BYTE_DATA(unicode));
1428 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1429 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1430 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001431 _PyUnicode_UTF8(unicode) = NULL;
1432 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001433 PyObject_FREE(_PyUnicode_WSTR(unicode));
1434 _PyUnicode_WSTR(unicode) = NULL;
1435 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1436#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 }
1438 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1439 else {
1440#if SIZEOF_WCHAR_T == 2
1441 /* in case the native representation is 2-bytes, we need to allocate a
1442 new normalized 4-byte version. */
1443 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001444 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1445 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446 PyErr_NoMemory();
1447 return -1;
1448 }
1449 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1450 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001451 _PyUnicode_UTF8(unicode) = NULL;
1452 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001453 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1454 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001455 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 PyObject_FREE(_PyUnicode_WSTR(unicode));
1457 _PyUnicode_WSTR(unicode) = NULL;
1458 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1459#else
1460 assert(num_surrogates == 0);
1461
Victor Stinnerc3c74152011-10-02 20:39:55 +02001462 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001464 _PyUnicode_UTF8(unicode) = NULL;
1465 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1467#endif
1468 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1469 }
1470 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001471 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 return 0;
1473}
1474
Alexander Belopolsky40018472011-02-26 01:02:56 +00001475static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001476unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477{
Walter Dörwald16807132007-05-25 13:52:07 +00001478 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001479 case SSTATE_NOT_INTERNED:
1480 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001481
Benjamin Peterson29060642009-01-31 22:14:21 +00001482 case SSTATE_INTERNED_MORTAL:
1483 /* revive dead object temporarily for DelItem */
1484 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001485 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001486 Py_FatalError(
1487 "deletion of interned string failed");
1488 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001489
Benjamin Peterson29060642009-01-31 22:14:21 +00001490 case SSTATE_INTERNED_IMMORTAL:
1491 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001492
Benjamin Peterson29060642009-01-31 22:14:21 +00001493 default:
1494 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001495 }
1496
Victor Stinner03490912011-10-03 23:45:12 +02001497 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001498 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001499 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001500 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001501
1502 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001503 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504 }
1505 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001506 if (_PyUnicode_DATA_ANY(unicode))
1507 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001508 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509 }
1510}
1511
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons kept
   by this file (the empty string, or an entry of the unicode_latin1 cache
   of one-character strings), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    ascii = (PyASCIIObject *)unicode;
    /* Only one-character strings that are not in the wchar_t
       representation are looked up in the latin1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch >= 256)
        return 0;
    return unicode_latin1[ch] == unicode;
}
#endif
1528
Alexander Belopolsky40018472011-02-26 01:02:56 +00001529static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001530unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001531{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001532 if (Py_REFCNT(unicode) != 1)
1533 return 0;
1534 if (PyUnicode_CHECK_INTERNED(unicode))
1535 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001536#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001537 /* singleton refcount is greater than 1 */
1538 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001539#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001540 return 1;
1541}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001542
Victor Stinnerfe226c02011-10-03 03:52:20 +02001543static int
1544unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1545{
1546 PyObject *unicode;
1547 Py_ssize_t old_length;
1548
1549 assert(p_unicode != NULL);
1550 unicode = *p_unicode;
1551
1552 assert(unicode != NULL);
1553 assert(PyUnicode_Check(unicode));
1554 assert(0 <= length);
1555
Victor Stinner910337b2011-10-03 03:20:16 +02001556 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001557 old_length = PyUnicode_WSTR_LENGTH(unicode);
1558 else
1559 old_length = PyUnicode_GET_LENGTH(unicode);
1560 if (old_length == length)
1561 return 0;
1562
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001563 if (length == 0) {
1564 Py_DECREF(*p_unicode);
1565 *p_unicode = unicode_empty;
1566 Py_INCREF(*p_unicode);
1567 return 0;
1568 }
1569
Victor Stinnerfe226c02011-10-03 03:52:20 +02001570 if (!unicode_resizable(unicode)) {
1571 PyObject *copy = resize_copy(unicode, length);
1572 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001573 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001574 Py_DECREF(*p_unicode);
1575 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001576 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001577 }
1578
Victor Stinnerfe226c02011-10-03 03:52:20 +02001579 if (PyUnicode_IS_COMPACT(unicode)) {
1580 *p_unicode = resize_compact(unicode, length);
1581 if (*p_unicode == NULL)
1582 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001583 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001585 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001586 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001587}
1588
Alexander Belopolsky40018472011-02-26 01:02:56 +00001589int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001590PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001591{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592 PyObject *unicode;
1593 if (p_unicode == NULL) {
1594 PyErr_BadInternalCall();
1595 return -1;
1596 }
1597 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001598 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001599 {
1600 PyErr_BadInternalCall();
1601 return -1;
1602 }
1603 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001604}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001605
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001606static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001607unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001608{
1609 PyObject *result;
1610 assert(PyUnicode_IS_READY(*p_unicode));
1611 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1612 return 0;
1613 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1614 maxchar);
1615 if (result == NULL)
1616 return -1;
1617 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1618 PyUnicode_GET_LENGTH(*p_unicode));
1619 Py_DECREF(*p_unicode);
1620 *p_unicode = result;
1621 return 0;
1622}
1623
1624static int
1625unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1626 Py_UCS4 ch)
1627{
1628 if (unicode_widen(p_unicode, ch) < 0)
1629 return -1;
1630 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1631 PyUnicode_DATA(*p_unicode),
1632 (*pos)++, ch);
1633 return 0;
1634}
1635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636static PyObject*
1637get_latin1_char(unsigned char ch)
1638{
Victor Stinnera464fc12011-10-02 20:39:30 +02001639 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001641 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 if (!unicode)
1643 return NULL;
1644 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001645 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646 unicode_latin1[ch] = unicode;
1647 }
1648 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001649 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650}
1651
Alexander Belopolsky40018472011-02-26 01:02:56 +00001652PyObject *
1653PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001654{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001655 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 Py_UCS4 maxchar = 0;
1657 Py_ssize_t num_surrogates;
1658
1659 if (u == NULL)
1660 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001662 /* If the Unicode data is known at construction time, we can apply
1663 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001665 /* Optimization for empty strings */
1666 if (size == 0 && unicode_empty != NULL) {
1667 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001668 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001669 }
Tim Petersced69f82003-09-16 20:30:58 +00001670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671 /* Single character Unicode objects in the Latin-1 range are
1672 shared when using this constructor */
1673 if (size == 1 && *u < 256)
1674 return get_latin1_char((unsigned char)*u);
1675
1676 /* If not empty and not single character, copy the Unicode data
1677 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001678 if (find_maxchar_surrogates(u, u + size,
1679 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001680 return NULL;
1681
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001682 unicode = PyUnicode_New(size - num_surrogates,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 if (!unicode)
1685 return NULL;
1686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001687 switch (PyUnicode_KIND(unicode)) {
1688 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001689 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001690 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1691 break;
1692 case PyUnicode_2BYTE_KIND:
1693#if Py_UNICODE_SIZE == 2
1694 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1695#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001696 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1698#endif
1699 break;
1700 case PyUnicode_4BYTE_KIND:
1701#if SIZEOF_WCHAR_T == 2
1702 /* This is the only case which has to process surrogates, thus
1703 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001704 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705#else
1706 assert(num_surrogates == 0);
1707 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1708#endif
1709 break;
1710 default:
1711 assert(0 && "Impossible state");
1712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001714 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715}
1716
Alexander Belopolsky40018472011-02-26 01:02:56 +00001717PyObject *
1718PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001719{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001720 if (size < 0) {
1721 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001722 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001723 return NULL;
1724 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001725
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001726 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001727 some optimizations which share commonly used objects.
1728 Also, this means the input must be UTF-8, so fall back to the
1729 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001730 if (u != NULL) {
1731
Benjamin Peterson29060642009-01-31 22:14:21 +00001732 /* Optimization for empty strings */
1733 if (size == 0 && unicode_empty != NULL) {
1734 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001735 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001736 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001737
1738 /* Single characters are shared when using this constructor.
1739 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001740 if (size == 1 && (unsigned char)*u < 128)
1741 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001742
1743 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001744 }
1745
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001746 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001747}
1748
Alexander Belopolsky40018472011-02-26 01:02:56 +00001749PyObject *
1750PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001751{
1752 size_t size = strlen(u);
1753 if (size > PY_SSIZE_T_MAX) {
1754 PyErr_SetString(PyExc_OverflowError, "input too long");
1755 return NULL;
1756 }
1757
1758 return PyUnicode_FromStringAndSize(u, size);
1759}
1760
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001761PyObject *
1762_PyUnicode_FromId(_Py_Identifier *id)
1763{
1764 if (!id->object) {
1765 id->object = PyUnicode_FromString(id->string);
1766 if (!id->object)
1767 return NULL;
1768 PyUnicode_InternInPlace(&id->object);
1769 assert(!id->next);
1770 id->next = static_strings;
1771 static_strings = id;
1772 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001773 return id->object;
1774}
1775
1776void
1777_PyUnicode_ClearStaticStrings()
1778{
1779 _Py_Identifier *i;
1780 for (i = static_strings; i; i = i->next) {
1781 Py_DECREF(i->object);
1782 i->object = NULL;
1783 i->next = NULL;
1784 }
1785}
1786
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001787/* Internal function, don't check maximum character */
1788
Victor Stinnere57b1c02011-09-28 22:20:48 +02001789static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001790unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001791{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001792 PyObject *res;
1793#ifdef Py_DEBUG
1794 const unsigned char *p;
1795 const unsigned char *end = s + size;
1796 for (p=s; p < end; p++) {
1797 assert(*p < 128);
1798 }
1799#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001800 if (size == 1)
1801 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001802 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001803 if (!res)
1804 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001805 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001806 return res;
1807}
1808
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001809static Py_UCS4
1810kind_maxchar_limit(unsigned int kind)
1811{
1812 switch(kind) {
1813 case PyUnicode_1BYTE_KIND:
1814 return 0x80;
1815 case PyUnicode_2BYTE_KIND:
1816 return 0x100;
1817 case PyUnicode_4BYTE_KIND:
1818 return 0x10000;
1819 default:
1820 assert(0 && "invalid kind");
1821 return 0x10ffff;
1822 }
1823}
1824
Victor Stinner702c7342011-10-05 13:50:52 +02001825static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001826_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001827{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001829 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001830
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001831 if (size == 0) {
1832 Py_INCREF(unicode_empty);
1833 return unicode_empty;
1834 }
1835 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001836 if (size == 1)
1837 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001838
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001839 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001840 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841 if (!res)
1842 return NULL;
1843 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001844 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001846}
1847
Victor Stinnere57b1c02011-09-28 22:20:48 +02001848static PyObject*
1849_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001850{
1851 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001852 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001853
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001854 if (size == 0) {
1855 Py_INCREF(unicode_empty);
1856 return unicode_empty;
1857 }
1858 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001859 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001860 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001861
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001862 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001863 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001864 if (!res)
1865 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001866 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001868 else {
1869 _PyUnicode_CONVERT_BYTES(
1870 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1871 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001872 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 return res;
1874}
1875
Victor Stinnere57b1c02011-09-28 22:20:48 +02001876static PyObject*
1877_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001878{
1879 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001880 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001881
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001882 if (size == 0) {
1883 Py_INCREF(unicode_empty);
1884 return unicode_empty;
1885 }
1886 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001887 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001888 return get_latin1_char((unsigned char)u[0]);
1889
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001890 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001891 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 if (!res)
1893 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001894 if (max_char < 256)
1895 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1896 PyUnicode_1BYTE_DATA(res));
1897 else if (max_char < 0x10000)
1898 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1899 PyUnicode_2BYTE_DATA(res));
1900 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001902 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 return res;
1904}
1905
1906PyObject*
1907PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1908{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001909 if (size < 0) {
1910 PyErr_SetString(PyExc_ValueError, "size must be positive");
1911 return NULL;
1912 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913 switch(kind) {
1914 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001915 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001916 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001917 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001918 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001919 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001920 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001921 PyErr_SetString(PyExc_SystemError, "invalid kind");
1922 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001924}
1925
Victor Stinner25a4b292011-10-06 12:31:55 +02001926/* Ensure that a string uses the most efficient storage, if it is not the
1927 case: create a new string with of the right kind. Write NULL into *p_unicode
1928 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001929static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001930unicode_adjust_maxchar(PyObject **p_unicode)
1931{
1932 PyObject *unicode, *copy;
1933 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001934 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001935 unsigned int kind;
1936
1937 assert(p_unicode != NULL);
1938 unicode = *p_unicode;
1939 assert(PyUnicode_IS_READY(unicode));
1940 if (PyUnicode_IS_ASCII(unicode))
1941 return;
1942
1943 len = PyUnicode_GET_LENGTH(unicode);
1944 kind = PyUnicode_KIND(unicode);
1945 if (kind == PyUnicode_1BYTE_KIND) {
1946 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001947 max_char = ucs1lib_find_max_char(u, u + len);
1948 if (max_char >= 128)
1949 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001950 }
1951 else if (kind == PyUnicode_2BYTE_KIND) {
1952 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001953 max_char = ucs2lib_find_max_char(u, u + len);
1954 if (max_char >= 256)
1955 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001956 }
1957 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001958 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001959 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001960 max_char = ucs4lib_find_max_char(u, u + len);
1961 if (max_char >= 0x10000)
1962 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001963 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001964 copy = PyUnicode_New(len, max_char);
1965 copy_characters(copy, 0, unicode, 0, len);
1966 Py_DECREF(unicode);
1967 *p_unicode = copy;
1968}
1969
Victor Stinner034f6cf2011-09-30 02:26:44 +02001970PyObject*
1971PyUnicode_Copy(PyObject *unicode)
1972{
Victor Stinner87af4f22011-11-21 23:03:47 +01001973 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001974 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001975
Victor Stinner034f6cf2011-09-30 02:26:44 +02001976 if (!PyUnicode_Check(unicode)) {
1977 PyErr_BadInternalCall();
1978 return NULL;
1979 }
1980 if (PyUnicode_READY(unicode))
1981 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001982
Victor Stinner87af4f22011-11-21 23:03:47 +01001983 length = PyUnicode_GET_LENGTH(unicode);
1984 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001985 if (!copy)
1986 return NULL;
1987 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1988
Victor Stinner87af4f22011-11-21 23:03:47 +01001989 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1990 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001991 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001992 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001993}
1994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995
Victor Stinnerbc603d12011-10-02 01:00:40 +02001996/* Widen Unicode objects to larger buffers. Don't write terminating null
1997 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998
1999void*
2000_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2001{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002002 Py_ssize_t len;
2003 void *result;
2004 unsigned int skind;
2005
2006 if (PyUnicode_READY(s))
2007 return NULL;
2008
2009 len = PyUnicode_GET_LENGTH(s);
2010 skind = PyUnicode_KIND(s);
2011 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002012 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 return NULL;
2014 }
2015 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002016 case PyUnicode_2BYTE_KIND:
2017 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2018 if (!result)
2019 return PyErr_NoMemory();
2020 assert(skind == PyUnicode_1BYTE_KIND);
2021 _PyUnicode_CONVERT_BYTES(
2022 Py_UCS1, Py_UCS2,
2023 PyUnicode_1BYTE_DATA(s),
2024 PyUnicode_1BYTE_DATA(s) + len,
2025 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002027 case PyUnicode_4BYTE_KIND:
2028 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2029 if (!result)
2030 return PyErr_NoMemory();
2031 if (skind == PyUnicode_2BYTE_KIND) {
2032 _PyUnicode_CONVERT_BYTES(
2033 Py_UCS2, Py_UCS4,
2034 PyUnicode_2BYTE_DATA(s),
2035 PyUnicode_2BYTE_DATA(s) + len,
2036 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002038 else {
2039 assert(skind == PyUnicode_1BYTE_KIND);
2040 _PyUnicode_CONVERT_BYTES(
2041 Py_UCS1, Py_UCS4,
2042 PyUnicode_1BYTE_DATA(s),
2043 PyUnicode_1BYTE_DATA(s) + len,
2044 result);
2045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002047 default:
2048 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002049 }
Victor Stinner01698042011-10-04 00:04:26 +02002050 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002051 return NULL;
2052}
2053
2054static Py_UCS4*
2055as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2056 int copy_null)
2057{
2058 int kind;
2059 void *data;
2060 Py_ssize_t len, targetlen;
2061 if (PyUnicode_READY(string) == -1)
2062 return NULL;
2063 kind = PyUnicode_KIND(string);
2064 data = PyUnicode_DATA(string);
2065 len = PyUnicode_GET_LENGTH(string);
2066 targetlen = len;
2067 if (copy_null)
2068 targetlen++;
2069 if (!target) {
2070 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2071 PyErr_NoMemory();
2072 return NULL;
2073 }
2074 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2075 if (!target) {
2076 PyErr_NoMemory();
2077 return NULL;
2078 }
2079 }
2080 else {
2081 if (targetsize < targetlen) {
2082 PyErr_Format(PyExc_SystemError,
2083 "string is longer than the buffer");
2084 if (copy_null && 0 < targetsize)
2085 target[0] = 0;
2086 return NULL;
2087 }
2088 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002089 if (kind == PyUnicode_1BYTE_KIND) {
2090 Py_UCS1 *start = (Py_UCS1 *) data;
2091 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002092 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002093 else if (kind == PyUnicode_2BYTE_KIND) {
2094 Py_UCS2 *start = (Py_UCS2 *) data;
2095 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2096 }
2097 else {
2098 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002100 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002101 if (copy_null)
2102 target[len] = 0;
2103 return target;
2104}
2105
2106Py_UCS4*
2107PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2108 int copy_null)
2109{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002110 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002111 PyErr_BadInternalCall();
2112 return NULL;
2113 }
2114 return as_ucs4(string, target, targetsize, copy_null);
2115}
2116
2117Py_UCS4*
2118PyUnicode_AsUCS4Copy(PyObject *string)
2119{
2120 return as_ucs4(string, NULL, 0, 1);
2121}
2122
2123#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002124
Alexander Belopolsky40018472011-02-26 01:02:56 +00002125PyObject *
2126PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00002129 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002130 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00002131 PyErr_BadInternalCall();
2132 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133 }
2134
Martin v. Löwis790465f2008-04-05 20:41:37 +00002135 if (size == -1) {
2136 size = wcslen(w);
2137 }
2138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002139 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140}
2141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002142#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002143
Walter Dörwald346737f2007-05-31 10:44:43 +00002144static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002145makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2146 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002147{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002148 *fmt++ = '%';
2149 if (width) {
2150 if (zeropad)
2151 *fmt++ = '0';
2152 fmt += sprintf(fmt, "%d", width);
2153 }
2154 if (precision)
2155 fmt += sprintf(fmt, ".%d", precision);
2156 if (longflag)
2157 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002158 else if (longlongflag) {
2159 /* longlongflag should only ever be nonzero on machines with
2160 HAVE_LONG_LONG defined */
2161#ifdef HAVE_LONG_LONG
2162 char *f = PY_FORMAT_LONG_LONG;
2163 while (*f)
2164 *fmt++ = *f++;
2165#else
2166 /* we shouldn't ever get here */
2167 assert(0);
2168 *fmt++ = 'l';
2169#endif
2170 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002171 else if (size_tflag) {
2172 char *f = PY_FORMAT_SIZE_T;
2173 while (*f)
2174 *fmt++ = *f++;
2175 }
2176 *fmt++ = c;
2177 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002178}
2179
/* Helper for PyUnicode_FromFormatV(): parse the optional width, precision
   and length modifier of a single conversion specification.

   f must point at the '%' that introduces the specification.  Each of the
   p_* output pointers may be NULL, in which case that value is not stored.

   *p_width and *p_precision receive the decimal numbers found (0 when
   absent), e.g. "%2.5s" => width=2, precision=5.  Exactly one of
   *p_longflag, *p_longlongflag and *p_size_tflag is set to 1 when an "l",
   "ll" or "z" modifier is directly followed by 'd', 'u' or 'i'; a modifier
   followed by anything else is not consumed.

   Returns a pointer to the character the caller should dispatch on: normally
   the conversion character.  For a truncated or bogus specification the
   pointer is moved one character back so that the caller neither runs past
   the end of the string nor treats the character as a valid conversion. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    const char *cur = f + 1;        /* skip the '%' */
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* width */
    for (; Py_ISDIGIT((unsigned)*cur); cur++)
        width = (width * 10) + *cur - '0';

    /* optional ".precision" */
    if (*cur == '.') {
        cur++;
        for (; Py_ISDIGIT((unsigned)*cur); cur++)
            precision = (precision * 10) + *cur - '0';
        if (*cur == '%') {
            /* "%.3%s" => cur points to "3" */
            cur--;
        }
    }
    if (*cur == '\0') {
        /* bogus format "%.1" => go backward, cur points to "1" */
        cur--;
    }

    /* length modifier: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi */
    if (*cur == 'l') {
        if (cur[1] == 'd' || cur[1] == 'u' || cur[1] == 'i') {
            longflag = 1;
            cur += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (cur[1] == 'l' &&
                 (cur[2] == 'd' || cur[2] == 'u' || cur[2] == 'i')) {
            longlongflag = 1;
            cur += 2;
        }
#endif
    }
    else if (*cur == 'z' &&
             (cur[1] == 'd' || cur[1] == 'u' || cur[1] == 'i')) {
        size_tflag = 1;
        cur += 1;
    }

    /* hand the results back to the caller */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return cur;
}
2244
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002245/* maximum number of characters required for output of %ld. 21 characters
2246 allows for 64-bit integers (in decimal) and an optional sign. */
2247#define MAX_LONG_CHARS 21
2248/* maximum number of characters required for output of %lld.
2249 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2250 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2251#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2252
Walter Dörwaldd2034312007-05-18 16:29:38 +00002253PyObject *
2254PyUnicode_FromFormatV(const char *format, va_list vargs)
2255{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002256 va_list count;
2257 Py_ssize_t callcount = 0;
2258 PyObject **callresults = NULL;
2259 PyObject **callresult = NULL;
2260 Py_ssize_t n = 0;
2261 int width = 0;
2262 int precision = 0;
2263 int zeropad;
2264 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002265 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002266 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002267 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2269 Py_UCS4 argmaxchar;
2270 Py_ssize_t numbersize = 0;
2271 char *numberresults = NULL;
2272 char *numberresult = NULL;
2273 Py_ssize_t i;
2274 int kind;
2275 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002276
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002277 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002278 /* step 1: count the number of %S/%R/%A/%s format specifications
2279 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2280 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002282 * also estimate a upper bound for all the number formats in the string,
2283 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002285 for (f = format; *f; f++) {
2286 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002287 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002288 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2289 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2290 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2291 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002293 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002294#ifdef HAVE_LONG_LONG
2295 if (longlongflag) {
2296 if (width < MAX_LONG_LONG_CHARS)
2297 width = MAX_LONG_LONG_CHARS;
2298 }
2299 else
2300#endif
2301 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2302 including sign. Decimal takes the most space. This
2303 isn't enough for octal. If a width is specified we
2304 need more (which we allocate later). */
2305 if (width < MAX_LONG_CHARS)
2306 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002307
2308 /* account for the size + '\0' to separate numbers
2309 inside of the numberresults buffer */
2310 numbersize += (width + 1);
2311 }
2312 }
2313 else if ((unsigned char)*f > 127) {
2314 PyErr_Format(PyExc_ValueError,
2315 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2316 "string, got a non-ASCII byte: 0x%02x",
2317 (unsigned char)*f);
2318 return NULL;
2319 }
2320 }
2321 /* step 2: allocate memory for the results of
2322 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2323 if (callcount) {
2324 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2325 if (!callresults) {
2326 PyErr_NoMemory();
2327 return NULL;
2328 }
2329 callresult = callresults;
2330 }
2331 /* step 2.5: allocate memory for the results of formating numbers */
2332 if (numbersize) {
2333 numberresults = PyObject_Malloc(numbersize);
2334 if (!numberresults) {
2335 PyErr_NoMemory();
2336 goto fail;
2337 }
2338 numberresult = numberresults;
2339 }
2340
2341 /* step 3: format numbers and figure out how large a buffer we need */
2342 for (f = format; *f; f++) {
2343 if (*f == '%') {
2344 const char* p;
2345 int longflag;
2346 int longlongflag;
2347 int size_tflag;
2348 int numprinted;
2349
2350 p = f;
2351 zeropad = (f[1] == '0');
2352 f = parse_format_flags(f, &width, &precision,
2353 &longflag, &longlongflag, &size_tflag);
2354 switch (*f) {
2355 case 'c':
2356 {
2357 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002358 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002359 n++;
2360 break;
2361 }
2362 case '%':
2363 n++;
2364 break;
2365 case 'i':
2366 case 'd':
2367 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2368 width, precision, *f);
2369 if (longflag)
2370 numprinted = sprintf(numberresult, fmt,
2371 va_arg(count, long));
2372#ifdef HAVE_LONG_LONG
2373 else if (longlongflag)
2374 numprinted = sprintf(numberresult, fmt,
2375 va_arg(count, PY_LONG_LONG));
2376#endif
2377 else if (size_tflag)
2378 numprinted = sprintf(numberresult, fmt,
2379 va_arg(count, Py_ssize_t));
2380 else
2381 numprinted = sprintf(numberresult, fmt,
2382 va_arg(count, int));
2383 n += numprinted;
2384 /* advance by +1 to skip over the '\0' */
2385 numberresult += (numprinted + 1);
2386 assert(*(numberresult - 1) == '\0');
2387 assert(*(numberresult - 2) != '\0');
2388 assert(numprinted >= 0);
2389 assert(numberresult <= numberresults + numbersize);
2390 break;
2391 case 'u':
2392 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2393 width, precision, 'u');
2394 if (longflag)
2395 numprinted = sprintf(numberresult, fmt,
2396 va_arg(count, unsigned long));
2397#ifdef HAVE_LONG_LONG
2398 else if (longlongflag)
2399 numprinted = sprintf(numberresult, fmt,
2400 va_arg(count, unsigned PY_LONG_LONG));
2401#endif
2402 else if (size_tflag)
2403 numprinted = sprintf(numberresult, fmt,
2404 va_arg(count, size_t));
2405 else
2406 numprinted = sprintf(numberresult, fmt,
2407 va_arg(count, unsigned int));
2408 n += numprinted;
2409 numberresult += (numprinted + 1);
2410 assert(*(numberresult - 1) == '\0');
2411 assert(*(numberresult - 2) != '\0');
2412 assert(numprinted >= 0);
2413 assert(numberresult <= numberresults + numbersize);
2414 break;
2415 case 'x':
2416 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2417 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2418 n += numprinted;
2419 numberresult += (numprinted + 1);
2420 assert(*(numberresult - 1) == '\0');
2421 assert(*(numberresult - 2) != '\0');
2422 assert(numprinted >= 0);
2423 assert(numberresult <= numberresults + numbersize);
2424 break;
2425 case 'p':
2426 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2427 /* %p is ill-defined: ensure leading 0x. */
2428 if (numberresult[1] == 'X')
2429 numberresult[1] = 'x';
2430 else if (numberresult[1] != 'x') {
2431 memmove(numberresult + 2, numberresult,
2432 strlen(numberresult) + 1);
2433 numberresult[0] = '0';
2434 numberresult[1] = 'x';
2435 numprinted += 2;
2436 }
2437 n += numprinted;
2438 numberresult += (numprinted + 1);
2439 assert(*(numberresult - 1) == '\0');
2440 assert(*(numberresult - 2) != '\0');
2441 assert(numprinted >= 0);
2442 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002443 break;
2444 case 's':
2445 {
2446 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002447 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002448 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2449 if (!str)
2450 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 /* since PyUnicode_DecodeUTF8 returns already flexible
2452 unicode objects, there is no need to call ready on them */
2453 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002454 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002456 /* Remember the str and switch to the next slot */
2457 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002458 break;
2459 }
2460 case 'U':
2461 {
2462 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002463 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002464 if (PyUnicode_READY(obj) == -1)
2465 goto fail;
2466 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002467 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002469 break;
2470 }
2471 case 'V':
2472 {
2473 PyObject *obj = va_arg(count, PyObject *);
2474 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002475 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002476 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002477 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002478 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002479 if (PyUnicode_READY(obj) == -1)
2480 goto fail;
2481 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002482 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002483 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002484 *callresult++ = NULL;
2485 }
2486 else {
2487 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2488 if (!str_obj)
2489 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002490 if (PyUnicode_READY(str_obj)) {
2491 Py_DECREF(str_obj);
2492 goto fail;
2493 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002494 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002495 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002497 *callresult++ = str_obj;
2498 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002499 break;
2500 }
2501 case 'S':
2502 {
2503 PyObject *obj = va_arg(count, PyObject *);
2504 PyObject *str;
2505 assert(obj);
2506 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002507 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002508 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002509 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002510 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002511 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002512 /* Remember the str and switch to the next slot */
2513 *callresult++ = str;
2514 break;
2515 }
2516 case 'R':
2517 {
2518 PyObject *obj = va_arg(count, PyObject *);
2519 PyObject *repr;
2520 assert(obj);
2521 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002522 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002523 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002524 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002525 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002527 /* Remember the repr and switch to the next slot */
2528 *callresult++ = repr;
2529 break;
2530 }
2531 case 'A':
2532 {
2533 PyObject *obj = va_arg(count, PyObject *);
2534 PyObject *ascii;
2535 assert(obj);
2536 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002537 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002538 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002540 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002541 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002542 /* Remember the repr and switch to the next slot */
2543 *callresult++ = ascii;
2544 break;
2545 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002546 default:
2547 /* if we stumble upon an unknown
2548 formatting code, copy the rest of
2549 the format string to the output
2550 string. (we cannot just skip the
2551 code, since there's no way to know
2552 what's in the argument list) */
2553 n += strlen(p);
2554 goto expand;
2555 }
2556 } else
2557 n++;
2558 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002559 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002562 we don't have to resize the string.
2563 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002564 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002565 if (!string)
2566 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 kind = PyUnicode_KIND(string);
2568 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002569 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002572 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002573 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002574 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002575
2576 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002577 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2578 /* checking for == because the last argument could be a empty
2579 string, which causes i to point to end, the assert at the end of
2580 the loop */
2581 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002582
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 switch (*f) {
2584 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002585 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002586 const int ordinal = va_arg(vargs, int);
2587 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002588 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002589 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002590 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002592 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002593 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002594 case 'p':
2595 /* unused, since we already have the result */
2596 if (*f == 'p')
2597 (void) va_arg(vargs, void *);
2598 else
2599 (void) va_arg(vargs, int);
2600 /* extract the result from numberresults and append. */
2601 for (; *numberresult; ++i, ++numberresult)
2602 PyUnicode_WRITE(kind, data, i, *numberresult);
2603 /* skip over the separating '\0' */
2604 assert(*numberresult == '\0');
2605 numberresult++;
2606 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002607 break;
2608 case 's':
2609 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002610 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002611 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002612 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613 size = PyUnicode_GET_LENGTH(*callresult);
2614 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002615 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002616 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002617 /* We're done with the unicode()/repr() => forget it */
2618 Py_DECREF(*callresult);
2619 /* switch to next unicode()/repr() result */
2620 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002621 break;
2622 }
2623 case 'U':
2624 {
2625 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 Py_ssize_t size;
2627 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2628 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002629 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002630 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002631 break;
2632 }
2633 case 'V':
2634 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002636 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002637 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639 size = PyUnicode_GET_LENGTH(obj);
2640 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002641 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002642 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002643 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002644 size = PyUnicode_GET_LENGTH(*callresult);
2645 assert(PyUnicode_KIND(*callresult) <=
2646 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002647 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002648 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002649 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002650 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002651 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002652 break;
2653 }
2654 case 'S':
2655 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002656 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002657 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002658 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002659 /* unused, since we already have the result */
2660 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002661 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002662 copy_characters(string, i, *callresult, 0, size);
2663 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002664 /* We're done with the unicode()/repr() => forget it */
2665 Py_DECREF(*callresult);
2666 /* switch to next unicode()/repr() result */
2667 ++callresult;
2668 break;
2669 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002670 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002671 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002672 break;
2673 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002674 for (; *p; ++p, ++i)
2675 PyUnicode_WRITE(kind, data, i, *p);
2676 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002677 goto end;
2678 }
Victor Stinner1205f272010-09-11 00:54:47 +00002679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 else {
2681 assert(i < PyUnicode_GET_LENGTH(string));
2682 PyUnicode_WRITE(kind, data, i++, *f);
2683 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002685 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002686
Benjamin Peterson29060642009-01-31 22:14:21 +00002687 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002688 if (callresults)
2689 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 if (numberresults)
2691 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002692 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002693 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 if (callresults) {
2695 PyObject **callresult2 = callresults;
2696 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002697 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002698 ++callresult2;
2699 }
2700 PyObject_Free(callresults);
2701 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002702 if (numberresults)
2703 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002704 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002705}
2706
Walter Dörwaldd2034312007-05-18 16:29:38 +00002707PyObject *
2708PyUnicode_FromFormat(const char *format, ...)
2709{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 PyObject* ret;
2711 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002712
2713#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002714 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002715#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002716 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002717#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002718 ret = PyUnicode_FromFormatV(format, vargs);
2719 va_end(vargs);
2720 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002721}
2722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002723#ifdef HAVE_WCHAR_H
2724
Victor Stinner5593d8a2010-10-02 11:11:27 +00002725/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2726 convert a Unicode object to a wide character string.
2727
Victor Stinnerd88d9832011-09-06 02:00:05 +02002728 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002729 character) required to convert the unicode object. Ignore size argument.
2730
Victor Stinnerd88d9832011-09-06 02:00:05 +02002731 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002732 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002733 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002734static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002735unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002736 wchar_t *w,
2737 Py_ssize_t size)
2738{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002739 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002740 const wchar_t *wstr;
2741
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002742 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002743 if (wstr == NULL)
2744 return -1;
2745
Victor Stinner5593d8a2010-10-02 11:11:27 +00002746 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002747 if (size > res)
2748 size = res + 1;
2749 else
2750 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002752 return res;
2753 }
2754 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002755 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002756}
2757
2758Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002759PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002760 wchar_t *w,
2761 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762{
2763 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002764 PyErr_BadInternalCall();
2765 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002767 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768}
2769
Victor Stinner137c34c2010-09-29 10:25:54 +00002770wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002771PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002772 Py_ssize_t *size)
2773{
2774 wchar_t* buffer;
2775 Py_ssize_t buflen;
2776
2777 if (unicode == NULL) {
2778 PyErr_BadInternalCall();
2779 return NULL;
2780 }
2781
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002782 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002783 if (buflen == -1)
2784 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002785 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002786 PyErr_NoMemory();
2787 return NULL;
2788 }
2789
Victor Stinner137c34c2010-09-29 10:25:54 +00002790 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2791 if (buffer == NULL) {
2792 PyErr_NoMemory();
2793 return NULL;
2794 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002795 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002796 if (buflen == -1)
2797 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002798 if (size != NULL)
2799 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002800 return buffer;
2801}
2802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002803#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804
Alexander Belopolsky40018472011-02-26 01:02:56 +00002805PyObject *
2806PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002807{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002808 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002809 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002810 PyErr_SetString(PyExc_ValueError,
2811 "chr() arg not in range(0x110000)");
2812 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002813 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002815 if (ordinal < 256)
2816 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002818 v = PyUnicode_New(1, ordinal);
2819 if (v == NULL)
2820 return NULL;
2821 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002822 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002823 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002824}
2825
Alexander Belopolsky40018472011-02-26 01:02:56 +00002826PyObject *
2827PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002829 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002830 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002831 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002832 if (PyUnicode_READY(obj))
2833 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002834 Py_INCREF(obj);
2835 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002836 }
2837 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002838 /* For a Unicode subtype that's not a Unicode object,
2839 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002840 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002841 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002842 PyErr_Format(PyExc_TypeError,
2843 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002844 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002845 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002846}
2847
Alexander Belopolsky40018472011-02-26 01:02:56 +00002848PyObject *
2849PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002850 const char *encoding,
2851 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002852{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002853 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002854 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002855
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002857 PyErr_BadInternalCall();
2858 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002860
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002861 /* Decoding bytes objects is the most common case and should be fast */
2862 if (PyBytes_Check(obj)) {
2863 if (PyBytes_GET_SIZE(obj) == 0) {
2864 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002865 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002866 }
2867 else {
2868 v = PyUnicode_Decode(
2869 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2870 encoding, errors);
2871 }
2872 return v;
2873 }
2874
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002875 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002876 PyErr_SetString(PyExc_TypeError,
2877 "decoding str is not supported");
2878 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002879 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002880
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002881 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2882 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2883 PyErr_Format(PyExc_TypeError,
2884 "coercing to str: need bytes, bytearray "
2885 "or buffer-like object, %.80s found",
2886 Py_TYPE(obj)->tp_name);
2887 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002888 }
Tim Petersced69f82003-09-16 20:30:58 +00002889
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002890 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002891 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002892 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893 }
Tim Petersced69f82003-09-16 20:30:58 +00002894 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002895 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002896
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002897 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002898 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002899}
2900
/* Normalize an encoding name into `lower`, a buffer of `lower_len` bytes:
   upper-case letters are lower-cased and '_' is replaced with '-', in order
   to catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   Return 1 on success (`lower` is then NUL-terminated).  Return 0 if the
   normalized name plus its terminating NUL does not fit into `lower_len`
   bytes; the contents of `lower` are unspecified in that case. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_name[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Without room for even the terminating NUL nothing can be stored.
       Bail out before forming &lower[lower_len - 1], whose index would
       wrap around for lower_len == 0. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* Bounds-check the default name as well instead of copying it
           blindly; sizeof() includes the terminating NUL. */
        if (sizeof(default_name) > lower_len)
            return 0;
        memcpy(lower, default_name, sizeof(default_name));
        return 1;
    }

    e = encoding;
    l = lower;
    /* Last slot of the buffer, reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            /* Advance separately so the macro argument has no side
               effect. */
            *l++ = Py_TOLOWER(*e);
            e++;
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2937
Alexander Belopolsky40018472011-02-26 01:02:56 +00002938PyObject *
2939PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002940 Py_ssize_t size,
2941 const char *encoding,
2942 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002943{
2944 PyObject *buffer = NULL, *unicode;
2945 Py_buffer info;
2946 char lower[11]; /* Enough for any encoding shortcut */
2947
Fred Drakee4315f52000-05-09 19:53:39 +00002948 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002949 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002950 if ((strcmp(lower, "utf-8") == 0) ||
2951 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002952 return PyUnicode_DecodeUTF8(s, size, errors);
2953 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002954 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002955 (strcmp(lower, "iso-8859-1") == 0))
2956 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002957#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002958 else if (strcmp(lower, "mbcs") == 0)
2959 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002960#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002961 else if (strcmp(lower, "ascii") == 0)
2962 return PyUnicode_DecodeASCII(s, size, errors);
2963 else if (strcmp(lower, "utf-16") == 0)
2964 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2965 else if (strcmp(lower, "utf-32") == 0)
2966 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968
2969 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002970 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002971 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002972 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002973 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974 if (buffer == NULL)
2975 goto onError;
2976 unicode = PyCodec_Decode(buffer, encoding, errors);
2977 if (unicode == NULL)
2978 goto onError;
2979 if (!PyUnicode_Check(unicode)) {
2980 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002981 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002982 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983 Py_DECREF(unicode);
2984 goto onError;
2985 }
2986 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002987 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002988
Benjamin Peterson29060642009-01-31 22:14:21 +00002989 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990 Py_XDECREF(buffer);
2991 return NULL;
2992}
2993
Alexander Belopolsky40018472011-02-26 01:02:56 +00002994PyObject *
2995PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002996 const char *encoding,
2997 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002998{
2999 PyObject *v;
3000
3001 if (!PyUnicode_Check(unicode)) {
3002 PyErr_BadArgument();
3003 goto onError;
3004 }
3005
3006 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003007 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003008
3009 /* Decode via the codec registry */
3010 v = PyCodec_Decode(unicode, encoding, errors);
3011 if (v == NULL)
3012 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003013 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003014
Benjamin Peterson29060642009-01-31 22:14:21 +00003015 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003016 return NULL;
3017}
3018
Alexander Belopolsky40018472011-02-26 01:02:56 +00003019PyObject *
3020PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003021 const char *encoding,
3022 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003023{
3024 PyObject *v;
3025
3026 if (!PyUnicode_Check(unicode)) {
3027 PyErr_BadArgument();
3028 goto onError;
3029 }
3030
3031 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003032 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003033
3034 /* Decode via the codec registry */
3035 v = PyCodec_Decode(unicode, encoding, errors);
3036 if (v == NULL)
3037 goto onError;
3038 if (!PyUnicode_Check(v)) {
3039 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003040 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003041 Py_TYPE(v)->tp_name);
3042 Py_DECREF(v);
3043 goto onError;
3044 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003045 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003046
Benjamin Peterson29060642009-01-31 22:14:21 +00003047 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003048 return NULL;
3049}
3050
Alexander Belopolsky40018472011-02-26 01:02:56 +00003051PyObject *
3052PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003053 Py_ssize_t size,
3054 const char *encoding,
3055 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056{
3057 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003058
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059 unicode = PyUnicode_FromUnicode(s, size);
3060 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003061 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3063 Py_DECREF(unicode);
3064 return v;
3065}
3066
Alexander Belopolsky40018472011-02-26 01:02:56 +00003067PyObject *
3068PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003069 const char *encoding,
3070 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003071{
3072 PyObject *v;
3073
3074 if (!PyUnicode_Check(unicode)) {
3075 PyErr_BadArgument();
3076 goto onError;
3077 }
3078
3079 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003080 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003081
3082 /* Encode via the codec registry */
3083 v = PyCodec_Encode(unicode, encoding, errors);
3084 if (v == NULL)
3085 goto onError;
3086 return v;
3087
Benjamin Peterson29060642009-01-31 22:14:21 +00003088 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003089 return NULL;
3090}
3091
Victor Stinnerad158722010-10-27 00:25:46 +00003092PyObject *
3093PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003094{
Victor Stinner99b95382011-07-04 14:23:54 +02003095#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003096 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003097#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003098 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003099#else
Victor Stinner793b5312011-04-27 00:24:21 +02003100 PyInterpreterState *interp = PyThreadState_GET()->interp;
3101 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3102 cannot use it to encode and decode filenames before it is loaded. Load
3103 the Python codec requires to encode at least its own filename. Use the C
3104 version of the locale codec until the codec registry is initialized and
3105 the Python codec is loaded.
3106
3107 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3108 cannot only rely on it: check also interp->fscodec_initialized for
3109 subinterpreters. */
3110 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003111 return PyUnicode_AsEncodedString(unicode,
3112 Py_FileSystemDefaultEncoding,
3113 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003114 }
3115 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003116 /* locale encoding with surrogateescape */
3117 wchar_t *wchar;
3118 char *bytes;
3119 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003120 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003121
3122 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3123 if (wchar == NULL)
3124 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003125 bytes = _Py_wchar2char(wchar, &error_pos);
3126 if (bytes == NULL) {
3127 if (error_pos != (size_t)-1) {
3128 char *errmsg = strerror(errno);
3129 PyObject *exc = NULL;
3130 if (errmsg == NULL)
3131 errmsg = "Py_wchar2char() failed";
3132 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003133 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003134 error_pos, error_pos+1,
3135 errmsg);
3136 Py_XDECREF(exc);
3137 }
3138 else
3139 PyErr_NoMemory();
3140 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003141 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003142 }
3143 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003144
3145 bytes_obj = PyBytes_FromString(bytes);
3146 PyMem_Free(bytes);
3147 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003148 }
Victor Stinnerad158722010-10-27 00:25:46 +00003149#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003150}
3151
Alexander Belopolsky40018472011-02-26 01:02:56 +00003152PyObject *
3153PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003154 const char *encoding,
3155 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156{
3157 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003158 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003159
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 if (!PyUnicode_Check(unicode)) {
3161 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003162 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163 }
Fred Drakee4315f52000-05-09 19:53:39 +00003164
Fred Drakee4315f52000-05-09 19:53:39 +00003165 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003166 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003167 if ((strcmp(lower, "utf-8") == 0) ||
3168 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003169 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003170 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003171 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003172 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003173 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003174 }
Victor Stinner37296e82010-06-10 13:36:23 +00003175 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003176 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003177 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003178 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003179#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003180 else if (strcmp(lower, "mbcs") == 0)
3181 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003182#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003183 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003184 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003185 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003186
3187 /* Encode via the codec registry */
3188 v = PyCodec_Encode(unicode, encoding, errors);
3189 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003190 return NULL;
3191
3192 /* The normal path */
3193 if (PyBytes_Check(v))
3194 return v;
3195
3196 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003197 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003198 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003199 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003200
3201 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3202 "encoder %s returned bytearray instead of bytes",
3203 encoding);
3204 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003205 Py_DECREF(v);
3206 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003207 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003208
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003209 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3210 Py_DECREF(v);
3211 return b;
3212 }
3213
3214 PyErr_Format(PyExc_TypeError,
3215 "encoder did not return a bytes object (type=%.400s)",
3216 Py_TYPE(v)->tp_name);
3217 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003218 return NULL;
3219}
3220
Alexander Belopolsky40018472011-02-26 01:02:56 +00003221PyObject *
3222PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003223 const char *encoding,
3224 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003225{
3226 PyObject *v;
3227
3228 if (!PyUnicode_Check(unicode)) {
3229 PyErr_BadArgument();
3230 goto onError;
3231 }
3232
3233 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003234 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003235
3236 /* Encode via the codec registry */
3237 v = PyCodec_Encode(unicode, encoding, errors);
3238 if (v == NULL)
3239 goto onError;
3240 if (!PyUnicode_Check(v)) {
3241 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003242 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003243 Py_TYPE(v)->tp_name);
3244 Py_DECREF(v);
3245 goto onError;
3246 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003248
Benjamin Peterson29060642009-01-31 22:14:21 +00003249 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250 return NULL;
3251}
3252
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003253PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003254PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003255 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003256 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3257}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003258
Christian Heimes5894ba72007-11-04 11:43:14 +00003259PyObject*
3260PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3261{
Victor Stinner99b95382011-07-04 14:23:54 +02003262#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003263 return PyUnicode_DecodeMBCS(s, size, NULL);
3264#elif defined(__APPLE__)
3265 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3266#else
Victor Stinner793b5312011-04-27 00:24:21 +02003267 PyInterpreterState *interp = PyThreadState_GET()->interp;
3268 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3269 cannot use it to encode and decode filenames before it is loaded. Load
3270 the Python codec requires to encode at least its own filename. Use the C
3271 version of the locale codec until the codec registry is initialized and
3272 the Python codec is loaded.
3273
3274 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3275 cannot only rely on it: check also interp->fscodec_initialized for
3276 subinterpreters. */
3277 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003278 return PyUnicode_Decode(s, size,
3279 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003280 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003281 }
3282 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003283 /* locale encoding with surrogateescape */
3284 wchar_t *wchar;
3285 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003286 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003287
3288 if (s[size] != '\0' || size != strlen(s)) {
3289 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3290 return NULL;
3291 }
3292
Victor Stinner168e1172010-10-16 23:16:16 +00003293 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003294 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003295 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003296
Victor Stinner168e1172010-10-16 23:16:16 +00003297 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003298 PyMem_Free(wchar);
3299 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003300 }
Victor Stinnerad158722010-10-27 00:25:46 +00003301#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003302}
3303
Martin v. Löwis011e8422009-05-05 04:43:17 +00003304
3305int
3306PyUnicode_FSConverter(PyObject* arg, void* addr)
3307{
3308 PyObject *output = NULL;
3309 Py_ssize_t size;
3310 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003311 if (arg == NULL) {
3312 Py_DECREF(*(PyObject**)addr);
3313 return 1;
3314 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003315 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003316 output = arg;
3317 Py_INCREF(output);
3318 }
3319 else {
3320 arg = PyUnicode_FromObject(arg);
3321 if (!arg)
3322 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003323 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003324 Py_DECREF(arg);
3325 if (!output)
3326 return 0;
3327 if (!PyBytes_Check(output)) {
3328 Py_DECREF(output);
3329 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3330 return 0;
3331 }
3332 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003333 size = PyBytes_GET_SIZE(output);
3334 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003335 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003336 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003337 Py_DECREF(output);
3338 return 0;
3339 }
3340 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003341 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003342}
3343
3344
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003345int
3346PyUnicode_FSDecoder(PyObject* arg, void* addr)
3347{
3348 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003349 if (arg == NULL) {
3350 Py_DECREF(*(PyObject**)addr);
3351 return 1;
3352 }
3353 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003354 if (PyUnicode_READY(arg))
3355 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003356 output = arg;
3357 Py_INCREF(output);
3358 }
3359 else {
3360 arg = PyBytes_FromObject(arg);
3361 if (!arg)
3362 return 0;
3363 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3364 PyBytes_GET_SIZE(arg));
3365 Py_DECREF(arg);
3366 if (!output)
3367 return 0;
3368 if (!PyUnicode_Check(output)) {
3369 Py_DECREF(output);
3370 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3371 return 0;
3372 }
3373 }
Victor Stinner065836e2011-10-27 01:56:33 +02003374 if (PyUnicode_READY(output) < 0) {
3375 Py_DECREF(output);
3376 return 0;
3377 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003378 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003379 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003380 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3381 Py_DECREF(output);
3382 return 0;
3383 }
3384 *(PyObject**)addr = output;
3385 return Py_CLEANUP_SUPPORTED;
3386}
3387
3388
Martin v. Löwis5b222132007-06-10 09:51:05 +00003389char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003390PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003391{
Christian Heimesf3863112007-11-22 07:46:41 +00003392 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003393
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003394 if (!PyUnicode_Check(unicode)) {
3395 PyErr_BadArgument();
3396 return NULL;
3397 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003398 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003399 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003400
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003401 if (PyUnicode_UTF8(unicode) == NULL) {
3402 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003403 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3404 if (bytes == NULL)
3405 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003406 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3407 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003408 Py_DECREF(bytes);
3409 return NULL;
3410 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003411 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3412 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3413 PyBytes_AS_STRING(bytes),
3414 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003415 Py_DECREF(bytes);
3416 }
3417
3418 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003419 *psize = PyUnicode_UTF8_LENGTH(unicode);
3420 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003421}
3422
3423char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003424PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003425{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003426 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3427}
3428
3429#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003430static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003431#endif
3432
3433
3434Py_UNICODE *
3435PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3436{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003437 const unsigned char *one_byte;
3438#if SIZEOF_WCHAR_T == 4
3439 const Py_UCS2 *two_bytes;
3440#else
3441 const Py_UCS4 *four_bytes;
3442 const Py_UCS4 *ucs4_end;
3443 Py_ssize_t num_surrogates;
3444#endif
3445 wchar_t *w;
3446 wchar_t *wchar_end;
3447
3448 if (!PyUnicode_Check(unicode)) {
3449 PyErr_BadArgument();
3450 return NULL;
3451 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003452 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003453 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003454 assert(_PyUnicode_KIND(unicode) != 0);
3455 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003456
3457#ifdef Py_DEBUG
3458 ++unicode_as_unicode_calls;
3459#endif
3460
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003461 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003462#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003463 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3464 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003465 num_surrogates = 0;
3466
3467 for (; four_bytes < ucs4_end; ++four_bytes) {
3468 if (*four_bytes > 0xFFFF)
3469 ++num_surrogates;
3470 }
3471
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003472 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3473 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3474 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003475 PyErr_NoMemory();
3476 return NULL;
3477 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003478 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003479
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003480 w = _PyUnicode_WSTR(unicode);
3481 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3482 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003483 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3484 if (*four_bytes > 0xFFFF) {
3485 /* encode surrogate pair in this case */
3486 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3487 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3488 }
3489 else
3490 *w = *four_bytes;
3491
3492 if (w > wchar_end) {
3493 assert(0 && "Miscalculated string end");
3494 }
3495 }
3496 *w = 0;
3497#else
3498 /* sizeof(wchar_t) == 4 */
3499 Py_FatalError("Impossible unicode object state, wstr and str "
3500 "should share memory already.");
3501 return NULL;
3502#endif
3503 }
3504 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003505 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3506 (_PyUnicode_LENGTH(unicode) + 1));
3507 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003508 PyErr_NoMemory();
3509 return NULL;
3510 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003511 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3512 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3513 w = _PyUnicode_WSTR(unicode);
3514 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003515
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003516 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3517 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003518 for (; w < wchar_end; ++one_byte, ++w)
3519 *w = *one_byte;
3520 /* null-terminate the wstr */
3521 *w = 0;
3522 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003523 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003524#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003525 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003526 for (; w < wchar_end; ++two_bytes, ++w)
3527 *w = *two_bytes;
3528 /* null-terminate the wstr */
3529 *w = 0;
3530#else
3531 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003532 PyObject_FREE(_PyUnicode_WSTR(unicode));
3533 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003534 Py_FatalError("Impossible unicode object state, wstr "
3535 "and str should share memory already.");
3536 return NULL;
3537#endif
3538 }
3539 else {
3540 assert(0 && "This should never happen.");
3541 }
3542 }
3543 }
3544 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003545 *size = PyUnicode_WSTR_LENGTH(unicode);
3546 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003547}
3548
Alexander Belopolsky40018472011-02-26 01:02:56 +00003549Py_UNICODE *
3550PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003552 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553}
3554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003555
Alexander Belopolsky40018472011-02-26 01:02:56 +00003556Py_ssize_t
3557PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558{
3559 if (!PyUnicode_Check(unicode)) {
3560 PyErr_BadArgument();
3561 goto onError;
3562 }
3563 return PyUnicode_GET_SIZE(unicode);
3564
Benjamin Peterson29060642009-01-31 22:14:21 +00003565 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566 return -1;
3567}
3568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003569Py_ssize_t
3570PyUnicode_GetLength(PyObject *unicode)
3571{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003572 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003573 PyErr_BadArgument();
3574 return -1;
3575 }
3576
3577 return PyUnicode_GET_LENGTH(unicode);
3578}
3579
3580Py_UCS4
3581PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3582{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003583 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3584 PyErr_BadArgument();
3585 return (Py_UCS4)-1;
3586 }
3587 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3588 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003589 return (Py_UCS4)-1;
3590 }
3591 return PyUnicode_READ_CHAR(unicode, index);
3592}
3593
3594int
3595PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3596{
3597 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003598 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003599 return -1;
3600 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003601 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3602 PyErr_SetString(PyExc_IndexError, "string index out of range");
3603 return -1;
3604 }
3605 if (_PyUnicode_Dirty(unicode))
3606 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003607 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3608 index, ch);
3609 return 0;
3610}
3611
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3617
Victor Stinner554f3f02010-06-16 23:33:54 +00003618/* create or adjust a UnicodeDecodeError */
3619static void
3620make_decode_exception(PyObject **exceptionObject,
3621 const char *encoding,
3622 const char *input, Py_ssize_t length,
3623 Py_ssize_t startpos, Py_ssize_t endpos,
3624 const char *reason)
3625{
3626 if (*exceptionObject == NULL) {
3627 *exceptionObject = PyUnicodeDecodeError_Create(
3628 encoding, input, length, startpos, endpos, reason);
3629 }
3630 else {
3631 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3632 goto onError;
3633 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3634 goto onError;
3635 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3636 goto onError;
3637 }
3638 return;
3639
3640onError:
3641 Py_DECREF(*exceptionObject);
3642 *exceptionObject = NULL;
3643}
3644
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003645/* error handling callback helper:
3646 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003647 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003648 and adjust various state variables.
3649 return 0 on success, -1 on error
3650*/
3651
Alexander Belopolsky40018472011-02-26 01:02:56 +00003652static int
3653unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003654 const char *encoding, const char *reason,
3655 const char **input, const char **inend, Py_ssize_t *startinpos,
3656 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003657 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003659 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660
3661 PyObject *restuple = NULL;
3662 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003663 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003664 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003665 Py_ssize_t requiredsize;
3666 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003667 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003668 int res = -1;
3669
Victor Stinner596a6c42011-11-09 00:02:18 +01003670 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3671 outsize = PyUnicode_GET_LENGTH(*output);
3672 else
3673 outsize = _PyUnicode_WSTR_LENGTH(*output);
3674
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003676 *errorHandler = PyCodec_LookupError(errors);
3677 if (*errorHandler == NULL)
3678 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003679 }
3680
Victor Stinner554f3f02010-06-16 23:33:54 +00003681 make_decode_exception(exceptionObject,
3682 encoding,
3683 *input, *inend - *input,
3684 *startinpos, *endinpos,
3685 reason);
3686 if (*exceptionObject == NULL)
3687 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003688
3689 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3690 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003691 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003693 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003694 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 }
3696 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003697 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003698 if (PyUnicode_READY(repunicode) < 0)
3699 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003700
3701 /* Copy back the bytes variables, which might have been modified by the
3702 callback */
3703 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3704 if (!inputobj)
3705 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003706 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003707 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003708 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003709 *input = PyBytes_AS_STRING(inputobj);
3710 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003711 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003712 /* we can DECREF safely, as the exception has another reference,
3713 so the object won't go away. */
3714 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003715
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003717 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003718 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003719 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3720 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003721 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722
Victor Stinner596a6c42011-11-09 00:02:18 +01003723 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3724 /* need more space? (at least enough for what we
3725 have+the replacement+the rest of the string (starting
3726 at the new input position), so we won't have to check space
3727 when there are no errors in the rest of the string) */
3728 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3729 requiredsize = *outpos + replen + insize-newpos;
3730 if (requiredsize > outsize) {
3731 if (requiredsize<2*outsize)
3732 requiredsize = 2*outsize;
3733 if (unicode_resize(output, requiredsize) < 0)
3734 goto onError;
3735 }
3736 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003737 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003738 copy_characters(*output, *outpos, repunicode, 0, replen);
3739 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003740 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003741 else {
3742 wchar_t *repwstr;
3743 Py_ssize_t repwlen;
3744 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3745 if (repwstr == NULL)
3746 goto onError;
3747 /* need more space? (at least enough for what we
3748 have+the replacement+the rest of the string (starting
3749 at the new input position), so we won't have to check space
3750 when there are no errors in the rest of the string) */
3751 requiredsize = *outpos + repwlen + insize-newpos;
3752 if (requiredsize > outsize) {
3753 if (requiredsize < 2*outsize)
3754 requiredsize = 2*outsize;
3755 if (unicode_resize(output, requiredsize) < 0)
3756 goto onError;
3757 }
3758 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3759 *outpos += repwlen;
3760 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003761 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003762 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003763
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 /* we made it! */
3765 res = 0;
3766
Benjamin Peterson29060642009-01-31 22:14:21 +00003767 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768 Py_XDECREF(restuple);
3769 return res;
3770}
3771
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003772/* --- UTF-7 Codec -------------------------------------------------------- */
3773
Antoine Pitrou244651a2009-05-04 18:56:13 +00003774/* See RFC2152 for details. We encode conservatively and decode liberally. */
3775
/* Three simple macros defining base-64.
   IS_BASE64 and FROM_BASE64 evaluate their argument several times, so
   only pass expressions without side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   (Anything that is not a letter, digit or '+' maps to 63, so the result
   is only meaningful when IS_BASE64(c) holds.) */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
3798
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Bytes above 127 never decode directly.  The argument is
 * evaluated twice. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
3806
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127); each row below covers
 * 16 consecutive codes and is preceded by the characters it describes.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3840
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : the character; only values 1..127 can ever be direct (the
 *            range test also keeps the utf7_category index in bounds).
 * directO  : non-zero to emit "Set O" characters (category 1) as-is.
 * directWS : non-zero to emit whitespace (category 2) as-is.
 * "Set D" characters (category 0) are always direct.  The argument c is
 * evaluated several times, so it must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003852
Alexander Belopolsky40018472011-02-26 01:02:56 +00003853PyObject *
3854PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003855 Py_ssize_t size,
3856 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003857{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003858 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3859}
3860
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size  : the UTF-7 encoded input bytes.
 * errors   : error handling scheme, forwarded to the error handler helper.
 * consumed : if non-NULL, receives the number of input bytes actually
 *            decoded; an input that ends inside a shift sequence is then
 *            not an error -- the output is rolled back to the start of
 *            that sequence and *consumed is the offset of its '+'.
 *            If NULL, an input that ends inside a shift sequence with
 *            pending state is reported through the error handler.
 *
 * Returns the decoded string object, or NULL on failure. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;          /* offset of the current '+' or bad byte */
    Py_ssize_t endinpos;
    Py_ssize_t outpos;              /* next write position in `unicode` */
    const char *e;
    PyObject *unicode;
    const char *errmsg = "";
    int inShift = 0;                /* inside a base-64 ('+...') section? */
    Py_ssize_t shiftOutStart;       /* outpos when the current shift began */
    unsigned int base64bits = 0;    /* number of pending bits in the buffer */
    unsigned long base64buffer = 0; /* pending base-64 bits, low-aligned */
    Py_UCS4 surrogate = 0;          /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    unicode = PyUnicode_New(size, 127);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return unicode;
    }

    shiftOutStart = outpos = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Re-entry point used after the end-of-input error handler moved
           `s` back inside the buffer. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
                            /* valid pair: combine into one code point */
                            Py_UCS4 ch2 = (((surrogate & 0x3FF)<<10)
                                           | (outCh & 0x3FF)) + 0x10000;
                            if (unicode_putchar(&unicode, &outpos, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired high surrogate: emit it as-is, then
                               handle outCh normally below */
                            if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (outCh >= 0xD800 && outCh <= 0xDBFF) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (unicode_putchar(&unicode, &outpos, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    /* flush a high surrogate that never got its partner */
                    if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    if (unicode_putchar(&unicode, &outpos, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (unicode_putchar(&unicode, &outpos, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = outpos;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            if (unicode_putchar(&unicode, &outpos, ch) < 0)
                goto onError;
            s++;
        }
        else {
            /* byte above 127 outside a shift sequence */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Report input[startinpos:endinpos]; the helper updates s, starts,
           e, unicode and outpos according to the handler's answer. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos))
                goto onError;
            /* the handler may ask to resume before the end of the input */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            outpos = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    /* trim the over-allocated result to the characters actually written */
    if (unicode_resize(&unicode, outpos) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return unicode_result(unicode);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
4052
4053
Alexander Belopolsky40018472011-02-26 01:02:56 +00004054PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004055_PyUnicode_EncodeUTF7(PyObject *str,
4056 int base64SetO,
4057 int base64WhiteSpace,
4058 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004059{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004060 int kind;
4061 void *data;
4062 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004063 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004064 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004065 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004066 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004067 unsigned int base64bits = 0;
4068 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004069 char * out;
4070 char * start;
4071
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004072 if (PyUnicode_READY(str) < 0)
4073 return NULL;
4074 kind = PyUnicode_KIND(str);
4075 data = PyUnicode_DATA(str);
4076 len = PyUnicode_GET_LENGTH(str);
4077
4078 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004079 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004080
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004081 /* It might be possible to tighten this worst case */
4082 allocated = 8 * len;
4083 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004084 return PyErr_NoMemory();
4085
Antoine Pitrou244651a2009-05-04 18:56:13 +00004086 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004087 if (v == NULL)
4088 return NULL;
4089
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004090 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004091 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004092 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004093
Antoine Pitrou244651a2009-05-04 18:56:13 +00004094 if (inShift) {
4095 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4096 /* shifting out */
4097 if (base64bits) { /* output remaining bits */
4098 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4099 base64buffer = 0;
4100 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004101 }
4102 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004103 /* Characters not in the BASE64 set implicitly unshift the sequence
4104 so no '-' is required, except if the character is itself a '-' */
4105 if (IS_BASE64(ch) || ch == '-') {
4106 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004107 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004108 *out++ = (char) ch;
4109 }
4110 else {
4111 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004112 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004113 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004114 else { /* not in a shift sequence */
4115 if (ch == '+') {
4116 *out++ = '+';
4117 *out++ = '-';
4118 }
4119 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4120 *out++ = (char) ch;
4121 }
4122 else {
4123 *out++ = '+';
4124 inShift = 1;
4125 goto encode_char;
4126 }
4127 }
4128 continue;
4129encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004130 if (ch >= 0x10000) {
4131 /* code first surrogate */
4132 base64bits += 16;
4133 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4134 while (base64bits >= 6) {
4135 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4136 base64bits -= 6;
4137 }
4138 /* prepare second surrogate */
4139 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4140 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004141 base64bits += 16;
4142 base64buffer = (base64buffer << 16) | ch;
4143 while (base64bits >= 6) {
4144 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4145 base64bits -= 6;
4146 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004147 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004148 if (base64bits)
4149 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4150 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004151 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004152 if (_PyBytes_Resize(&v, out - start) < 0)
4153 return NULL;
4154 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004155}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004156PyObject *
4157PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4158 Py_ssize_t size,
4159 int base64SetO,
4160 int base64WhiteSpace,
4161 const char *errors)
4162{
4163 PyObject *result;
4164 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4165 if (tmp == NULL)
4166 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004167 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004168 base64WhiteSpace, errors);
4169 Py_DECREF(tmp);
4170 return result;
4171}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004172
/* The UTF-7 helper macros are private to the codec above; drop them so
   the names cannot leak into the rest of the file. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004178
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179/* --- UTF-8 Codec -------------------------------------------------------- */
4180
/* Lookup table indexed by the first byte of a UTF-8 sequence; each row
   below covers 16 consecutive byte values. */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4202
Alexander Belopolsky40018472011-02-26 01:02:56 +00004203PyObject *
4204PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004205 Py_ssize_t size,
4206 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004207{
Walter Dörwald69652032004-09-07 20:24:22 +00004208 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4209}
4210
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004211#include "stringlib/ucs1lib.h"
4212#include "stringlib/codecs.h"
4213#include "stringlib/undef.h"
4214
4215#include "stringlib/ucs2lib.h"
4216#include "stringlib/codecs.h"
4217#include "stringlib/undef.h"
4218
4219#include "stringlib/ucs4lib.h"
4220#include "stringlib/codecs.h"
4221#include "stringlib/undef.h"
4222
Antoine Pitrouab868312009-01-10 15:40:25 +00004223/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4224#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4225
4226/* Mask to quickly check whether a C 'long' contains a
4227 non-ASCII, UTF8-encoded char. */
4228#if (SIZEOF_LONG == 8)
4229# define ASCII_CHAR_MASK 0x8080808080808080L
4230#elif (SIZEOF_LONG == 4)
4231# define ASCII_CHAR_MASK 0x80808080L
4232#else
4233# error C 'long' size should be either 4 or 8!
4234#endif
4235
/* Scans a UTF-8 string and returns the maximum character to be expected
   and the size of the decoded unicode string.

   s, string_size : the UTF-8 bytes to scan.
   unicode_size   : out-parameter (must not be NULL); receives the number
                    of bytes that are not continuation bytes (10xxxxxx),
                    i.e. the expected number of decoded characters.

   The return value is an upper bound chosen from the largest lead byte
   seen: 127 (all bytes < 0x80), 255 (lead bytes < 0xC4), 65535 (lead
   bytes < 0xF0), or 65537 (anything else, i.e. beyond 16 bits).

   This function doesn't check for errors, these checks are performed in
   PyUnicode_DecodeUTF8Stateful.
   */
static Py_UCS4
utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
                                  Py_ssize_t *unicode_size)
{
    Py_ssize_t char_count = 0;
    const unsigned char *p = (const unsigned char *)s;
    const unsigned char *end = p + string_size;
    /* `end` rounded down to a multiple of sizeof(long) */
    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);

    assert(unicode_size != NULL);

    /* By having a cascade of independent loops which fallback onto each
       other, we minimize the amount of work done in the average loop
       iteration, and we also maximize the CPU's ability to predict
       branches correctly (because a given condition will have always the
       same boolean outcome except perhaps in the last iteration of the
       corresponding loop).
       In the general case this brings us rather close to decoding
       performance pre-PEP 393, despite the two-pass decoding.

       Note that the pure ASCII loop is not duplicated once a non-ASCII
       character has been encountered. It is actually a pessimization (by
       a significant factor) to use this loop on text with many non-ASCII
       characters, and it is important to avoid bad performance on valid
       utf-8 data (invalid utf-8 being a different can of worms).
    */

    /* ASCII */
    for (; p < end; ++p) {
        /* Try the word-at-a-time fast path only while the current byte is
           ASCII. */
        if (*p < 0x80) {
            /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
               an explanation. */
            if (!((size_t) p & LONG_PTR_MASK)) {
                /* Help register allocation */
                register const unsigned char *_p = p;
                while (_p < aligned_end) {
                    unsigned long value = *(unsigned long *) _p;
                    if (value & ASCII_CHAR_MASK)
                        break;
                    /* a whole 'long' of ASCII bytes: count them at once */
                    _p += SIZEOF_LONG;
                    char_count += SIZEOF_LONG;
                }
                p = _p;
                if (p == end)
                    break;
            }
        }
        /* p may have advanced above, so the byte is tested again */
        if (*p < 0x80)
            ++char_count;
        else
            goto _ucs1loop;
    }
    *unicode_size = char_count;
    return 127;

    /* Each loop below starts on the byte that ended the previous one and
       counts every byte that is not a continuation byte. */
_ucs1loop:
    for (; p < end; ++p) {
        if (*p < 0xc4)
            char_count += ((*p & 0xc0) != 0x80);
        else
            goto _ucs2loop;
    }
    *unicode_size = char_count;
    return 255;

_ucs2loop:
    for (; p < end; ++p) {
        if (*p < 0xf0)
            char_count += ((*p & 0xc0) != 0x80);
        else
            goto _ucs4loop;
    }
    *unicode_size = char_count;
    return 65535;

_ucs4loop:
    for (; p < end; ++p) {
        char_count += ((*p & 0xc0) != 0x80);
    }
    *unicode_size = char_count;
    return 65537;
}
4325
4326/* Called when we encountered some error that wasn't detected in the original
4327 scan, e.g. an encoded surrogate character. The original maxchar computation
4328 may have been incorrect, so redo it. */
4329static int
4330refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
4331{
4332 PyObject *tmp;
4333 Py_ssize_t k, maxchar;
4334 for (k = 0, maxchar = 0; k < n; k++)
4335 maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
4336 tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
4337 if (tmp == NULL)
4338 return -1;
4339 PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
4340 Py_DECREF(*unicode);
4341 *unicode = tmp;
4342 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004343}
4344
/* Store `value` at position `index` of the string being decoded.

   Implicit parameters taken from the expansion site: unicode, kind, data,
   has_errors, and the label onError.

   While has_errors is zero this is a plain PyUnicode_WRITE into
   (kind, data).  Once has_errors is set, the write goes through
   unicode_putchar on the object itself, after growing the string with
   unicode_resize when the index lies beyond the current length; either
   call failing jumps to onError.  The growth overallocates
   (index + index/8), so the result needs to be shrunk at the end. */
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        if (!has_errors) {                                              \
            PyUnicode_WRITE(kind, data, index, value);                  \
        }                                                               \
        else {                                                          \
            Py_ssize_t pos = index;                                     \
            if (pos > PyUnicode_GET_LENGTH(unicode)) {                  \
                if (unicode_resize(&unicode, pos + pos/8) < 0)          \
                    goto onError;                                       \
            }                                                           \
            if (unicode_putchar(&unicode, &pos, value) < 0)             \
                goto onError;                                           \
        }                                                               \
    } while (0)
4363
Alexander Belopolsky40018472011-02-26 01:02:56 +00004364PyObject *
4365PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004366 Py_ssize_t size,
4367 const char *errors,
4368 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004369{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004370 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004371 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004372 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004373 Py_ssize_t startinpos;
4374 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004375 const char *e, *aligned_end;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004376 PyObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004377 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004378 PyObject *errorHandler = NULL;
4379 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004380 Py_UCS4 maxchar = 0;
4381 Py_ssize_t unicode_size;
4382 Py_ssize_t i;
4383 int kind;
4384 void *data;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004385 int has_errors = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386
Walter Dörwald69652032004-09-07 20:24:22 +00004387 if (size == 0) {
4388 if (consumed)
4389 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004390 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004391 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004392 maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004393 /* When the string is ASCII only, just use memcpy and return.
4394 unicode_size may be != size if there is an incomplete UTF-8
4395 sequence at the end of the ASCII block. */
4396 if (maxchar < 128 && size == unicode_size) {
Victor Stinner42885202011-11-22 01:23:02 +01004397 if (consumed)
4398 *consumed = size;
4399
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004400 if (size == 1)
4401 return get_latin1_char((unsigned char)s[0]);
4402
4403 unicode = PyUnicode_New(unicode_size, maxchar);
4404 if (!unicode)
4405 return NULL;
4406 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4407 assert(_PyUnicode_CheckConsistency(unicode, 1));
4408 return unicode;
4409 }
4410
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004411 /* In case of errors, maxchar and size computation might be incorrect;
4412 code below refits and resizes as necessary. */
4413 unicode = PyUnicode_New(unicode_size, maxchar);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004414 if (!unicode)
4415 return NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004416 kind = PyUnicode_KIND(unicode);
4417 data = PyUnicode_DATA(unicode);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004418
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004420 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421 e = s + size;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004422 switch (kind) {
4423 case PyUnicode_1BYTE_KIND:
4424 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4425 break;
4426 case PyUnicode_2BYTE_KIND:
4427 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4428 break;
4429 case PyUnicode_4BYTE_KIND:
4430 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4431 break;
4432 }
4433 if (!has_errors) {
4434 /* Ensure the unicode size calculation was correct */
4435 assert(i == unicode_size);
4436 assert(s == e);
4437 if (consumed)
4438 *consumed = s-starts;
4439 return unicode;
4440 }
4441 /* Fall through to the generic decoding loop for the rest of
4442 the string */
4443 if (refit_partial_string(&unicode, kind, data, i) < 0)
4444 goto onError;
4445
Antoine Pitrouab868312009-01-10 15:40:25 +00004446 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004447
4448 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004449 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450
4451 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004452 /* Fast path for runs of ASCII characters. Given that common UTF-8
4453 input will consist of an overwhelming majority of ASCII
4454 characters, we try to optimize for this case by checking
4455 as many characters as a C 'long' can contain.
4456 First, check if we can do an aligned read, as most CPUs have
4457 a penalty for unaligned reads.
4458 */
4459 if (!((size_t) s & LONG_PTR_MASK)) {
4460 /* Help register allocation */
4461 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004462 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004463 while (_s < aligned_end) {
4464 /* Read a whole long at a time (either 4 or 8 bytes),
4465 and do a fast unrolled copy if it only contains ASCII
4466 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004467 unsigned long value = *(unsigned long *) _s;
4468 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004469 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004470 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4471 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4472 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4473 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004474#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004475 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4476 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4477 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4478 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004479#endif
4480 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004481 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004482 }
4483 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004484 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004485 if (s == e)
4486 break;
4487 ch = (unsigned char)*s;
4488 }
4489 }
4490
4491 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004492 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493 s++;
4494 continue;
4495 }
4496
4497 n = utf8_code_length[ch];
4498
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004499 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004500 if (consumed)
4501 break;
4502 else {
4503 errmsg = "unexpected end of data";
4504 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004505 endinpos = startinpos+1;
4506 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4507 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 goto utf8Error;
4509 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004510 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004511
4512 switch (n) {
4513
4514 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004515 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004516 startinpos = s-starts;
4517 endinpos = startinpos+1;
4518 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519
4520 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004521 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 startinpos = s-starts;
4523 endinpos = startinpos+1;
4524 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525
4526 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004527 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004528 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004530 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004531 goto utf8Error;
4532 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004533 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004534 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004535 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536 break;
4537
4538 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004539 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4540 will result in surrogates in range d800-dfff. Surrogates are
4541 not valid UTF-8 so they are rejected.
4542 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4543 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004544 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004545 (s[2] & 0xc0) != 0x80 ||
4546 ((unsigned char)s[0] == 0xE0 &&
4547 (unsigned char)s[1] < 0xA0) ||
4548 ((unsigned char)s[0] == 0xED &&
4549 (unsigned char)s[1] > 0x9F)) {
4550 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004551 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004552 endinpos = startinpos + 1;
4553
4554 /* if s[1] first two bits are 1 and 0, then the invalid
4555 continuation byte is s[2], so increment endinpos by 1,
4556 if not, s[1] is invalid and endinpos doesn't need to
4557 be incremented. */
4558 if ((s[1] & 0xC0) == 0x80)
4559 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004560 goto utf8Error;
4561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004563 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004564 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004565 break;
4566
4567 case 4:
4568 if ((s[1] & 0xc0) != 0x80 ||
4569 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004570 (s[3] & 0xc0) != 0x80 ||
4571 ((unsigned char)s[0] == 0xF0 &&
4572 (unsigned char)s[1] < 0x90) ||
4573 ((unsigned char)s[0] == 0xF4 &&
4574 (unsigned char)s[1] > 0x8F)) {
4575 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004576 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004577 endinpos = startinpos + 1;
4578 if ((s[1] & 0xC0) == 0x80) {
4579 endinpos++;
4580 if ((s[2] & 0xC0) == 0x80)
4581 endinpos++;
4582 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004583 goto utf8Error;
4584 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004585 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004586 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4587 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4588
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004589 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591 }
4592 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004594
Benjamin Peterson29060642009-01-31 22:14:21 +00004595 utf8Error:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004596 if (!has_errors) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004597 if (refit_partial_string(&unicode, kind, data, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004598 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004599 has_errors = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004600 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004601 if (unicode_decode_call_errorhandler(
4602 errors, &errorHandler,
4603 "utf8", errmsg,
4604 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004605 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004606 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004607 /* Update data because unicode_decode_call_errorhandler might have
4608 re-created or resized the unicode object. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004609 data = PyUnicode_DATA(unicode);
4610 kind = PyUnicode_KIND(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004611 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004613 /* Ensure the unicode_size calculation above was correct: */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004614 assert(has_errors || i == unicode_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004615
Walter Dörwald69652032004-09-07 20:24:22 +00004616 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004617 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004619 /* Adjust length and ready string when it contained errors and
4620 is of the old resizable kind. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004621 if (has_errors) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004622 if (PyUnicode_Resize(&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004623 goto onError;
4624 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004625
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004626 Py_XDECREF(errorHandler);
4627 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004628 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004629 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630
Benjamin Peterson29060642009-01-31 22:14:21 +00004631 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004632 Py_XDECREF(errorHandler);
4633 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634 Py_DECREF(unicode);
4635 return NULL;
4636}
4637
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004638#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004639
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004640#ifdef __APPLE__
4641
4642/* Simplified UTF-8 decoder using surrogateescape error handler,
4643 used to decode the command line arguments on Mac OS X. */
4644
4645wchar_t*
4646_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4647{
4648 int n;
4649 const char *e;
4650 wchar_t *unicode, *p;
4651
4652 /* Note: size will always be longer than the resulting Unicode
4653 character count */
4654 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4655 PyErr_NoMemory();
4656 return NULL;
4657 }
4658 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4659 if (!unicode)
4660 return NULL;
4661
4662 /* Unpack UTF-8 encoded data */
4663 p = unicode;
4664 e = s + size;
4665 while (s < e) {
4666 Py_UCS4 ch = (unsigned char)*s;
4667
4668 if (ch < 0x80) {
4669 *p++ = (wchar_t)ch;
4670 s++;
4671 continue;
4672 }
4673
4674 n = utf8_code_length[ch];
4675 if (s + n > e) {
4676 goto surrogateescape;
4677 }
4678
4679 switch (n) {
4680 case 0:
4681 case 1:
4682 goto surrogateescape;
4683
4684 case 2:
4685 if ((s[1] & 0xc0) != 0x80)
4686 goto surrogateescape;
4687 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4688 assert ((ch > 0x007F) && (ch <= 0x07FF));
4689 *p++ = (wchar_t)ch;
4690 break;
4691
4692 case 3:
4693 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4694 will result in surrogates in range d800-dfff. Surrogates are
4695 not valid UTF-8 so they are rejected.
4696 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4697 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4698 if ((s[1] & 0xc0) != 0x80 ||
4699 (s[2] & 0xc0) != 0x80 ||
4700 ((unsigned char)s[0] == 0xE0 &&
4701 (unsigned char)s[1] < 0xA0) ||
4702 ((unsigned char)s[0] == 0xED &&
4703 (unsigned char)s[1] > 0x9F)) {
4704
4705 goto surrogateescape;
4706 }
4707 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4708 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004709 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004710 break;
4711
4712 case 4:
4713 if ((s[1] & 0xc0) != 0x80 ||
4714 (s[2] & 0xc0) != 0x80 ||
4715 (s[3] & 0xc0) != 0x80 ||
4716 ((unsigned char)s[0] == 0xF0 &&
4717 (unsigned char)s[1] < 0x90) ||
4718 ((unsigned char)s[0] == 0xF4 &&
4719 (unsigned char)s[1] > 0x8F)) {
4720 goto surrogateescape;
4721 }
4722 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4723 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4724 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4725
4726#if SIZEOF_WCHAR_T == 4
4727 *p++ = (wchar_t)ch;
4728#else
4729 /* compute and append the two surrogates: */
4730
4731 /* translate from 10000..10FFFF to 0..FFFF */
4732 ch -= 0x10000;
4733
4734 /* high surrogate = top 10 bits added to D800 */
4735 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4736
4737 /* low surrogate = bottom 10 bits added to DC00 */
4738 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4739#endif
4740 break;
4741 }
4742 s += n;
4743 continue;
4744
4745 surrogateescape:
4746 *p++ = 0xDC00 + ch;
4747 s++;
4748 }
4749 *p = L'\0';
4750 return unicode;
4751}
4752
4753#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004755/* Primary internal function which creates utf8 encoded bytes objects.
4756
4757 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004758 and allocate exactly as much space needed at the end. Else allocate the
4759 maximum possible needed (4 result bytes per Unicode character), and return
4760 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004761*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004762PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004763_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764{
Tim Peters602f7402002-04-27 18:03:26 +00004765#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004766
Guido van Rossum98297ee2007-11-06 21:34:58 +00004767 Py_ssize_t i; /* index into s of next input byte */
4768 PyObject *result; /* result string object */
4769 char *p; /* next free byte in output buffer */
4770 Py_ssize_t nallocated; /* number of result bytes allocated */
4771 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004772 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004773 PyObject *errorHandler = NULL;
4774 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004775 int kind;
4776 void *data;
4777 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004778 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004780 if (!PyUnicode_Check(unicode)) {
4781 PyErr_BadArgument();
4782 return NULL;
4783 }
4784
4785 if (PyUnicode_READY(unicode) == -1)
4786 return NULL;
4787
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004788 if (PyUnicode_UTF8(unicode))
4789 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4790 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004791
4792 kind = PyUnicode_KIND(unicode);
4793 data = PyUnicode_DATA(unicode);
4794 size = PyUnicode_GET_LENGTH(unicode);
4795
Tim Peters602f7402002-04-27 18:03:26 +00004796 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797
Tim Peters602f7402002-04-27 18:03:26 +00004798 if (size <= MAX_SHORT_UNICHARS) {
4799 /* Write into the stack buffer; nallocated can't overflow.
4800 * At the end, we'll allocate exactly as much heap space as it
4801 * turns out we need.
4802 */
4803 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004804 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004805 p = stackbuf;
4806 }
4807 else {
4808 /* Overallocate on the heap, and give the excess back at the end. */
4809 nallocated = size * 4;
4810 if (nallocated / 4 != size) /* overflow! */
4811 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004812 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004813 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004814 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004815 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004816 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004817
Tim Peters602f7402002-04-27 18:03:26 +00004818 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004819 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004820
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004821 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004822 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004824
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004826 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004827 *p++ = (char)(0xc0 | (ch >> 6));
4828 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004829 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004830 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004831 Py_ssize_t repsize, k, startpos;
4832 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004833 rep = unicode_encode_call_errorhandler(
4834 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004835 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004836 if (!rep)
4837 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004839 if (PyBytes_Check(rep))
4840 repsize = PyBytes_GET_SIZE(rep);
4841 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004842 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004843
4844 if (repsize > 4) {
4845 Py_ssize_t offset;
4846
4847 if (result == NULL)
4848 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004849 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004850 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004852 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4853 /* integer overflow */
4854 PyErr_NoMemory();
4855 goto error;
4856 }
4857 nallocated += repsize - 4;
4858 if (result != NULL) {
4859 if (_PyBytes_Resize(&result, nallocated) < 0)
4860 goto error;
4861 } else {
4862 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004863 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004864 goto error;
4865 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4866 }
4867 p = PyBytes_AS_STRING(result) + offset;
4868 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004869
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004870 if (PyBytes_Check(rep)) {
4871 char *prep = PyBytes_AS_STRING(rep);
4872 for(k = repsize; k > 0; k--)
4873 *p++ = *prep++;
4874 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004875 enum PyUnicode_Kind repkind;
4876 void *repdata;
4877
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004878 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004879 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004880 repkind = PyUnicode_KIND(rep);
4881 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004882
4883 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004884 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004885 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004886 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004887 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004888 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004889 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004890 goto error;
4891 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004892 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004893 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004894 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004895 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004896 } else if (ch < 0x10000) {
4897 *p++ = (char)(0xe0 | (ch >> 12));
4898 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4899 *p++ = (char)(0x80 | (ch & 0x3f));
4900 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004901 /* Encode UCS4 Unicode ordinals */
4902 *p++ = (char)(0xf0 | (ch >> 18));
4903 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4904 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4905 *p++ = (char)(0x80 | (ch & 0x3f));
4906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004908
Guido van Rossum98297ee2007-11-06 21:34:58 +00004909 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004910 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004911 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004912 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004913 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004914 }
4915 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004916 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004917 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004918 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004919 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004920 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004921
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004922 Py_XDECREF(errorHandler);
4923 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004924 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004925 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004926 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004927 Py_XDECREF(errorHandler);
4928 Py_XDECREF(exc);
4929 Py_XDECREF(result);
4930 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004931
Tim Peters602f7402002-04-27 18:03:26 +00004932#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004933}
4934
Alexander Belopolsky40018472011-02-26 01:02:56 +00004935PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004936PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4937 Py_ssize_t size,
4938 const char *errors)
4939{
4940 PyObject *v, *unicode;
4941
4942 unicode = PyUnicode_FromUnicode(s, size);
4943 if (unicode == NULL)
4944 return NULL;
4945 v = _PyUnicode_AsUTF8String(unicode, errors);
4946 Py_DECREF(unicode);
4947 return v;
4948}
4949
4950PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004951PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004953 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954}
4955
Walter Dörwald41980ca2007-08-16 21:55:45 +00004956/* --- UTF-32 Codec ------------------------------------------------------- */
4957
4958PyObject *
4959PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004960 Py_ssize_t size,
4961 const char *errors,
4962 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004963{
4964 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4965}
4966
4967PyObject *
4968PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004969 Py_ssize_t size,
4970 const char *errors,
4971 int *byteorder,
4972 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004973{
4974 const char *starts = s;
4975 Py_ssize_t startinpos;
4976 Py_ssize_t endinpos;
4977 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004978 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004979 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004980 int bo = 0; /* assume native ordering by default */
4981 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004982 /* Offsets from q for retrieving bytes in the right order. */
4983#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4984 int iorder[] = {0, 1, 2, 3};
4985#else
4986 int iorder[] = {3, 2, 1, 0};
4987#endif
4988 PyObject *errorHandler = NULL;
4989 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004990
Walter Dörwald41980ca2007-08-16 21:55:45 +00004991 q = (unsigned char *)s;
4992 e = q + size;
4993
4994 if (byteorder)
4995 bo = *byteorder;
4996
4997 /* Check for BOM marks (U+FEFF) in the input and adjust current
4998 byte order setting accordingly. In native mode, the leading BOM
4999 mark is skipped, in all other modes, it is copied to the output
5000 stream as-is (giving a ZWNBSP character). */
5001 if (bo == 0) {
5002 if (size >= 4) {
5003 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005004 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005005#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005006 if (bom == 0x0000FEFF) {
5007 q += 4;
5008 bo = -1;
5009 }
5010 else if (bom == 0xFFFE0000) {
5011 q += 4;
5012 bo = 1;
5013 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005014#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005015 if (bom == 0x0000FEFF) {
5016 q += 4;
5017 bo = 1;
5018 }
5019 else if (bom == 0xFFFE0000) {
5020 q += 4;
5021 bo = -1;
5022 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005023#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005024 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005025 }
5026
5027 if (bo == -1) {
5028 /* force LE */
5029 iorder[0] = 0;
5030 iorder[1] = 1;
5031 iorder[2] = 2;
5032 iorder[3] = 3;
5033 }
5034 else if (bo == 1) {
5035 /* force BE */
5036 iorder[0] = 3;
5037 iorder[1] = 2;
5038 iorder[2] = 1;
5039 iorder[3] = 0;
5040 }
5041
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005042 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005043 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005044 if (!unicode)
5045 return NULL;
5046 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005047 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005048 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005049
Walter Dörwald41980ca2007-08-16 21:55:45 +00005050 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 Py_UCS4 ch;
5052 /* remaining bytes at the end? (size should be divisible by 4) */
5053 if (e-q<4) {
5054 if (consumed)
5055 break;
5056 errmsg = "truncated data";
5057 startinpos = ((const char *)q)-starts;
5058 endinpos = ((const char *)e)-starts;
5059 goto utf32Error;
5060 /* The remaining input chars are ignored if the callback
5061 chooses to skip the input */
5062 }
5063 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5064 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005065
Benjamin Peterson29060642009-01-31 22:14:21 +00005066 if (ch >= 0x110000)
5067 {
5068 errmsg = "codepoint not in range(0x110000)";
5069 startinpos = ((const char *)q)-starts;
5070 endinpos = startinpos+4;
5071 goto utf32Error;
5072 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005073 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5074 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005075 q += 4;
5076 continue;
5077 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005078 if (unicode_decode_call_errorhandler(
5079 errors, &errorHandler,
5080 "utf32", errmsg,
5081 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005082 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005083 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005084 }
5085
5086 if (byteorder)
5087 *byteorder = bo;
5088
5089 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005090 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005091
5092 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005093 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005094 goto onError;
5095
5096 Py_XDECREF(errorHandler);
5097 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005098 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005099
Benjamin Peterson29060642009-01-31 22:14:21 +00005100 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005101 Py_DECREF(unicode);
5102 Py_XDECREF(errorHandler);
5103 Py_XDECREF(exc);
5104 return NULL;
5105}
5106
5107PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005108_PyUnicode_EncodeUTF32(PyObject *str,
5109 const char *errors,
5110 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005111{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005112 int kind;
5113 void *data;
5114 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005115 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005116 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005117 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005118 /* Offsets from p for storing byte pairs in the right order. */
5119#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5120 int iorder[] = {0, 1, 2, 3};
5121#else
5122 int iorder[] = {3, 2, 1, 0};
5123#endif
5124
Benjamin Peterson29060642009-01-31 22:14:21 +00005125#define STORECHAR(CH) \
5126 do { \
5127 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5128 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5129 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5130 p[iorder[0]] = (CH) & 0xff; \
5131 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005132 } while(0)
5133
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005134 if (!PyUnicode_Check(str)) {
5135 PyErr_BadArgument();
5136 return NULL;
5137 }
5138 if (PyUnicode_READY(str) < 0)
5139 return NULL;
5140 kind = PyUnicode_KIND(str);
5141 data = PyUnicode_DATA(str);
5142 len = PyUnicode_GET_LENGTH(str);
5143
5144 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005145 bytesize = nsize * 4;
5146 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005147 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005148 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005149 if (v == NULL)
5150 return NULL;
5151
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005152 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005153 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005154 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005155 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005156 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005157
5158 if (byteorder == -1) {
5159 /* force LE */
5160 iorder[0] = 0;
5161 iorder[1] = 1;
5162 iorder[2] = 2;
5163 iorder[3] = 3;
5164 }
5165 else if (byteorder == 1) {
5166 /* force BE */
5167 iorder[0] = 3;
5168 iorder[1] = 2;
5169 iorder[2] = 1;
5170 iorder[3] = 0;
5171 }
5172
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005173 for (i = 0; i < len; i++)
5174 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005175
5176 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005177 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005178#undef STORECHAR
5179}
5180
Alexander Belopolsky40018472011-02-26 01:02:56 +00005181PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005182PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5183 Py_ssize_t size,
5184 const char *errors,
5185 int byteorder)
5186{
5187 PyObject *result;
5188 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5189 if (tmp == NULL)
5190 return NULL;
5191 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5192 Py_DECREF(tmp);
5193 return result;
5194}
5195
5196PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005197PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005198{
Victor Stinnerb960b342011-11-20 19:12:52 +01005199 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005200}
5201
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202/* --- UTF-16 Codec ------------------------------------------------------- */
5203
Tim Peters772747b2001-08-09 22:21:55 +00005204PyObject *
5205PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005206 Py_ssize_t size,
5207 const char *errors,
5208 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209{
Walter Dörwald69652032004-09-07 20:24:22 +00005210 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5211}
5212
Antoine Pitrouab868312009-01-10 15:40:25 +00005213/* Two masks for fast checking of whether a C 'long' may contain
5214 UTF16-encoded surrogate characters. This is an efficient heuristic,
5215 assuming that non-surrogate characters with a code point >= 0x8000 are
5216 rare in most input.
5217 FAST_CHAR_MASK is used when the input is in native byte ordering,
5218 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005219*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005220#if (SIZEOF_LONG == 8)
5221# define FAST_CHAR_MASK 0x8000800080008000L
5222# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5223#elif (SIZEOF_LONG == 4)
5224# define FAST_CHAR_MASK 0x80008000L
5225# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5226#else
5227# error C 'long' size should be either 4 or 8!
5228#endif
5229
Walter Dörwald69652032004-09-07 20:24:22 +00005230PyObject *
5231PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 Py_ssize_t size,
5233 const char *errors,
5234 int *byteorder,
5235 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005236{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005237 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005238 Py_ssize_t startinpos;
5239 Py_ssize_t endinpos;
5240 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005241 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005242 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005243 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005244 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005245 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005246 /* Offsets from q for retrieving byte pairs in the right order. */
5247#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5248 int ihi = 1, ilo = 0;
5249#else
5250 int ihi = 0, ilo = 1;
5251#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005252 PyObject *errorHandler = NULL;
5253 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254
5255 /* Note: size will always be longer than the resulting Unicode
5256 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005257 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258 if (!unicode)
5259 return NULL;
5260 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005261 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005262 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263
Tim Peters772747b2001-08-09 22:21:55 +00005264 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005265 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266
5267 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005268 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005270 /* Check for BOM marks (U+FEFF) in the input and adjust current
5271 byte order setting accordingly. In native mode, the leading BOM
5272 mark is skipped, in all other modes, it is copied to the output
5273 stream as-is (giving a ZWNBSP character). */
5274 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005275 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005276 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005277#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005278 if (bom == 0xFEFF) {
5279 q += 2;
5280 bo = -1;
5281 }
5282 else if (bom == 0xFFFE) {
5283 q += 2;
5284 bo = 1;
5285 }
Tim Petersced69f82003-09-16 20:30:58 +00005286#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005287 if (bom == 0xFEFF) {
5288 q += 2;
5289 bo = 1;
5290 }
5291 else if (bom == 0xFFFE) {
5292 q += 2;
5293 bo = -1;
5294 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005295#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005296 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005297 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298
Tim Peters772747b2001-08-09 22:21:55 +00005299 if (bo == -1) {
5300 /* force LE */
5301 ihi = 1;
5302 ilo = 0;
5303 }
5304 else if (bo == 1) {
5305 /* force BE */
5306 ihi = 0;
5307 ilo = 1;
5308 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005309#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5310 native_ordering = ilo < ihi;
5311#else
5312 native_ordering = ilo > ihi;
5313#endif
Tim Peters772747b2001-08-09 22:21:55 +00005314
Antoine Pitrouab868312009-01-10 15:40:25 +00005315 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005316 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005317 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005318 /* First check for possible aligned read of a C 'long'. Unaligned
5319 reads are more expensive, better to defer to another iteration. */
5320 if (!((size_t) q & LONG_PTR_MASK)) {
5321 /* Fast path for runs of non-surrogate chars. */
5322 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005323 int kind = PyUnicode_KIND(unicode);
5324 void *data = PyUnicode_DATA(unicode);
5325 while (_q < aligned_end) {
5326 unsigned long block = * (unsigned long *) _q;
5327 unsigned short *pblock = (unsigned short*)&block;
5328 Py_UCS4 maxch;
5329 if (native_ordering) {
5330 /* Can use buffer directly */
5331 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005332 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005333 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005334 else {
5335 /* Need to byte-swap */
5336 unsigned char *_p = (unsigned char*)pblock;
5337 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005338 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005339 _p[0] = _q[1];
5340 _p[1] = _q[0];
5341 _p[2] = _q[3];
5342 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005343#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005344 _p[4] = _q[5];
5345 _p[5] = _q[4];
5346 _p[6] = _q[7];
5347 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005348#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005349 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005350 maxch = Py_MAX(pblock[0], pblock[1]);
5351#if SIZEOF_LONG == 8
5352 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5353#endif
5354 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5355 if (unicode_widen(&unicode, maxch) < 0)
5356 goto onError;
5357 kind = PyUnicode_KIND(unicode);
5358 data = PyUnicode_DATA(unicode);
5359 }
5360 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5361 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5362#if SIZEOF_LONG == 8
5363 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5364 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5365#endif
5366 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005367 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005368 q = _q;
5369 if (q >= e)
5370 break;
5371 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005372 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005373
Benjamin Peterson14339b62009-01-31 16:36:08 +00005374 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005375
5376 if (ch < 0xD800 || ch > 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005377 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5378 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 continue;
5380 }
5381
5382 /* UTF-16 code pair: */
5383 if (q > e) {
5384 errmsg = "unexpected end of data";
5385 startinpos = (((const char *)q) - 2) - starts;
5386 endinpos = ((const char *)e) + 1 - starts;
5387 goto utf16Error;
5388 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005389 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5390 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005392 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005393 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005394 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005395 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005396 continue;
5397 }
5398 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005399 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005400 startinpos = (((const char *)q)-4)-starts;
5401 endinpos = startinpos+2;
5402 goto utf16Error;
5403 }
5404
Benjamin Peterson14339b62009-01-31 16:36:08 +00005405 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005406 errmsg = "illegal encoding";
5407 startinpos = (((const char *)q)-2)-starts;
5408 endinpos = startinpos+2;
5409 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005410
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005413 errors,
5414 &errorHandler,
5415 "utf16", errmsg,
5416 &starts,
5417 (const char **)&e,
5418 &startinpos,
5419 &endinpos,
5420 &exc,
5421 (const char **)&q,
5422 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005423 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005424 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005426 /* remaining byte at the end? (size should be even) */
5427 if (e == q) {
5428 if (!consumed) {
5429 errmsg = "truncated data";
5430 startinpos = ((const char *)q) - starts;
5431 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005432 if (unicode_decode_call_errorhandler(
5433 errors,
5434 &errorHandler,
5435 "utf16", errmsg,
5436 &starts,
5437 (const char **)&e,
5438 &startinpos,
5439 &endinpos,
5440 &exc,
5441 (const char **)&q,
5442 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005443 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005444 goto onError;
5445 /* The remaining input chars are ignored if the callback
5446 chooses to skip the input */
5447 }
5448 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449
5450 if (byteorder)
5451 *byteorder = bo;
5452
Walter Dörwald69652032004-09-07 20:24:22 +00005453 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005454 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005455
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005457 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 goto onError;
5459
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005460 Py_XDECREF(errorHandler);
5461 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005462 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463
Benjamin Peterson29060642009-01-31 22:14:21 +00005464 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005466 Py_XDECREF(errorHandler);
5467 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468 return NULL;
5469}
5470
Antoine Pitrouab868312009-01-10 15:40:25 +00005471#undef FAST_CHAR_MASK
5472#undef SWAPPED_FAST_CHAR_MASK
5473
Tim Peters772747b2001-08-09 22:21:55 +00005474PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005475_PyUnicode_EncodeUTF16(PyObject *str,
5476 const char *errors,
5477 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005479 int kind;
5480 void *data;
5481 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005482 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005483 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005484 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005485 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005486 /* Offsets from p for storing byte pairs in the right order. */
5487#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5488 int ihi = 1, ilo = 0;
5489#else
5490 int ihi = 0, ilo = 1;
5491#endif
5492
Benjamin Peterson29060642009-01-31 22:14:21 +00005493#define STORECHAR(CH) \
5494 do { \
5495 p[ihi] = ((CH) >> 8) & 0xff; \
5496 p[ilo] = (CH) & 0xff; \
5497 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005498 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005500 if (!PyUnicode_Check(str)) {
5501 PyErr_BadArgument();
5502 return NULL;
5503 }
5504 if (PyUnicode_READY(str) < 0)
5505 return NULL;
5506 kind = PyUnicode_KIND(str);
5507 data = PyUnicode_DATA(str);
5508 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005509
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005510 pairs = 0;
5511 if (kind == PyUnicode_4BYTE_KIND)
5512 for (i = 0; i < len; i++)
5513 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5514 pairs++;
5515 /* 2 * (len + pairs + (byteorder == 0)) */
5516 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005518 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005519 bytesize = nsize * 2;
5520 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005522 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 if (v == NULL)
5524 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005526 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005529 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005530 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005531
5532 if (byteorder == -1) {
5533 /* force LE */
5534 ihi = 1;
5535 ilo = 0;
5536 }
5537 else if (byteorder == 1) {
5538 /* force BE */
5539 ihi = 0;
5540 ilo = 1;
5541 }
5542
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005543 for (i = 0; i < len; i++) {
5544 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5545 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005546 if (ch >= 0x10000) {
5547 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5548 ch = 0xD800 | ((ch-0x10000) >> 10);
5549 }
Tim Peters772747b2001-08-09 22:21:55 +00005550 STORECHAR(ch);
5551 if (ch2)
5552 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005553 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005554
5555 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005556 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005557#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558}
5559
Alexander Belopolsky40018472011-02-26 01:02:56 +00005560PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005561PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5562 Py_ssize_t size,
5563 const char *errors,
5564 int byteorder)
5565{
5566 PyObject *result;
5567 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5568 if (tmp == NULL)
5569 return NULL;
5570 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5571 Py_DECREF(tmp);
5572 return result;
5573}
5574
5575PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005576PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005578 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579}
5580
5581/* --- Unicode Escape Codec ----------------------------------------------- */
5582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005583/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5584 if all the escapes in the string make it still a valid ASCII string.
5585 Returns -1 if any escapes were found which cause the string to
5586 pop out of ASCII range. Otherwise returns the length of the
5587 required buffer to hold the string.
5588 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005589static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005590length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5591{
5592 const unsigned char *p = (const unsigned char *)s;
5593 const unsigned char *end = p + size;
5594 Py_ssize_t length = 0;
5595
5596 if (size < 0)
5597 return -1;
5598
5599 for (; p < end; ++p) {
5600 if (*p > 127) {
5601 /* Non-ASCII */
5602 return -1;
5603 }
5604 else if (*p != '\\') {
5605 /* Normal character */
5606 ++length;
5607 }
5608 else {
5609 /* Backslash-escape, check next char */
5610 ++p;
5611 /* Escape sequence reaches till end of string or
5612 non-ASCII follow-up. */
5613 if (p >= end || *p > 127)
5614 return -1;
5615 switch (*p) {
5616 case '\n':
5617 /* backslash + \n result in zero characters */
5618 break;
5619 case '\\': case '\'': case '\"':
5620 case 'b': case 'f': case 't':
5621 case 'n': case 'r': case 'v': case 'a':
5622 ++length;
5623 break;
5624 case '0': case '1': case '2': case '3':
5625 case '4': case '5': case '6': case '7':
5626 case 'x': case 'u': case 'U': case 'N':
5627 /* these do not guarantee ASCII characters */
5628 return -1;
5629 default:
5630 /* count the backslash + the other character */
5631 length += 2;
5632 }
5633 }
5634 }
5635 return length;
5636}
5637
Fredrik Lundh06d12682001-01-24 07:59:11 +00005638static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005639
Alexander Belopolsky40018472011-02-26 01:02:56 +00005640PyObject *
5641PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005642 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005643 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005645 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005646 Py_ssize_t startinpos;
5647 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005648 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005649 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005650 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005651 char* message;
5652 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005653 PyObject *errorHandler = NULL;
5654 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005655 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005656 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005657
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005658 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005659
5660 /* After length_of_escaped_ascii_string() there are two alternatives,
5661 either the string is pure ASCII with named escapes like \n, etc.
5662 and we determined it's exact size (common case)
5663 or it contains \x, \u, ... escape sequences. then we create a
5664 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005665 if (len >= 0) {
5666 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005667 if (!v)
5668 goto onError;
5669 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005670 }
5671 else {
5672 /* Escaped strings will always be longer than the resulting
5673 Unicode string, so we start with size here and then reduce the
5674 length after conversion to the true value.
5675 (but if the error callback returns a long replacement string
5676 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005677 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005678 if (!v)
5679 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005680 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005681 }
5682
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005684 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005685 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005687
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 while (s < end) {
5689 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005690 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005691 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005693 /* The only case in which i == ascii_length is a backslash
5694 followed by a newline. */
5695 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005696
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 /* Non-escape characters are interpreted as Unicode ordinals */
5698 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005699 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5700 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 continue;
5702 }
5703
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005704 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 /* \ - Escapes */
5706 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005707 c = *s++;
5708 if (s > end)
5709 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005710
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005711 /* The only case in which i == ascii_length is a backslash
5712 followed by a newline. */
5713 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005714
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005715 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005718#define WRITECHAR(ch) \
5719 do { \
5720 if (unicode_putchar(&v, &i, ch) < 0) \
5721 goto onError; \
5722 }while(0)
5723
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005725 case '\\': WRITECHAR('\\'); break;
5726 case '\'': WRITECHAR('\''); break;
5727 case '\"': WRITECHAR('\"'); break;
5728 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005729 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005730 case 'f': WRITECHAR('\014'); break;
5731 case 't': WRITECHAR('\t'); break;
5732 case 'n': WRITECHAR('\n'); break;
5733 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005734 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005735 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005736 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005737 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 case '0': case '1': case '2': case '3':
5741 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005742 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005743 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005744 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005745 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005746 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005748 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749 break;
5750
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 /* hex escapes */
5752 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005754 digits = 2;
5755 message = "truncated \\xXX escape";
5756 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757
Benjamin Peterson29060642009-01-31 22:14:21 +00005758 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005760 digits = 4;
5761 message = "truncated \\uXXXX escape";
5762 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763
Benjamin Peterson29060642009-01-31 22:14:21 +00005764 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005765 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005766 digits = 8;
5767 message = "truncated \\UXXXXXXXX escape";
5768 hexescape:
5769 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005770 if (s+digits>end) {
5771 endinpos = size;
5772 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 errors, &errorHandler,
5774 "unicodeescape", "end of string in escape sequence",
5775 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005776 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005777 goto onError;
5778 goto nextByte;
5779 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005780 for (j = 0; j < digits; ++j) {
5781 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005782 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005783 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005784 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005785 errors, &errorHandler,
5786 "unicodeescape", message,
5787 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005788 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005789 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005790 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005791 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005792 }
5793 chr = (chr<<4) & ~0xF;
5794 if (c >= '0' && c <= '9')
5795 chr += c - '0';
5796 else if (c >= 'a' && c <= 'f')
5797 chr += 10 + c - 'a';
5798 else
5799 chr += 10 + c - 'A';
5800 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005801 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005802 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005803 /* _decoding_error will have already written into the
5804 target buffer. */
5805 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005806 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005807 /* when we get here, chr is a 32-bit unicode character */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005808 if (chr <= 0x10ffff) {
5809 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005810 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005811 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005812 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 errors, &errorHandler,
5814 "unicodeescape", "illegal Unicode character",
5815 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005816 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005817 goto onError;
5818 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005819 break;
5820
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005822 case 'N':
5823 message = "malformed \\N character escape";
5824 if (ucnhash_CAPI == NULL) {
5825 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005826 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5827 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005828 if (ucnhash_CAPI == NULL)
5829 goto ucnhashError;
5830 }
5831 if (*s == '{') {
5832 const char *start = s+1;
5833 /* look for the closing brace */
5834 while (*s != '}' && s < end)
5835 s++;
5836 if (s > start && s < end && *s == '}') {
5837 /* found a name. look it up in the unicode database */
5838 message = "unknown Unicode character name";
5839 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005840 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005841 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005842 goto store;
5843 }
5844 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005845 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005846 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 errors, &errorHandler,
5848 "unicodeescape", message,
5849 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005850 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005851 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005852 break;
5853
5854 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005855 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005856 message = "\\ at end of string";
5857 s--;
5858 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005859 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005860 errors, &errorHandler,
5861 "unicodeescape", message,
5862 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005863 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005864 goto onError;
5865 }
5866 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005867 WRITECHAR('\\');
5868 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005869 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005870 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005873 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005875#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005876
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005877 if (PyUnicode_Resize(&v, i) < 0)
5878 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005879 Py_XDECREF(errorHandler);
5880 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005881 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005882
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005884 PyErr_SetString(
5885 PyExc_UnicodeError,
5886 "\\N escapes not supported (can't load unicodedata module)"
5887 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005888 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005889 Py_XDECREF(errorHandler);
5890 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005891 return NULL;
5892
Benjamin Peterson29060642009-01-31 22:14:21 +00005893 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005895 Py_XDECREF(errorHandler);
5896 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 return NULL;
5898}
5899
5900/* Return a Unicode-Escape string version of the Unicode object.
5901
5902 If quotes is true, the string is enclosed in u"" or u'' quotes as
5903 appropriate.
5904
5905*/
5906
Alexander Belopolsky40018472011-02-26 01:02:56 +00005907PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005908PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005910 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005911 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005913 int kind;
5914 void *data;
5915 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916
Thomas Wouters89f507f2006-12-13 04:49:30 +00005917 /* Initial allocation is based on the longest-possible unichr
5918 escape.
5919
5920 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5921 unichr, so in this case it's the longest unichr escape. In
5922 narrow (UTF-16) builds this is five chars per source unichr
5923 since there are two unichrs in the surrogate pair, so in narrow
5924 (UTF-16) builds it's not the longest unichr escape.
5925
5926 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5927 so in the narrow (UTF-16) build case it's the longest unichr
5928 escape.
5929 */
5930
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005931 if (!PyUnicode_Check(unicode)) {
5932 PyErr_BadArgument();
5933 return NULL;
5934 }
5935 if (PyUnicode_READY(unicode) < 0)
5936 return NULL;
5937 len = PyUnicode_GET_LENGTH(unicode);
5938 kind = PyUnicode_KIND(unicode);
5939 data = PyUnicode_DATA(unicode);
5940 switch(kind) {
5941 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5942 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5943 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5944 }
5945
5946 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005947 return PyBytes_FromStringAndSize(NULL, 0);
5948
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005949 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005951
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005952 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005954 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005955 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956 if (repr == NULL)
5957 return NULL;
5958
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005959 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005961 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005962 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005963
Walter Dörwald79e913e2007-05-12 11:08:06 +00005964 /* Escape backslashes */
5965 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 *p++ = '\\';
5967 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005968 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005969 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005970
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005971 /* Map 21-bit characters to '\U00xxxxxx' */
5972 else if (ch >= 0x10000) {
5973 *p++ = '\\';
5974 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005975 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5976 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5977 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5978 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5979 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5980 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5981 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5982 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005984 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005985
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005987 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988 *p++ = '\\';
5989 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005990 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5991 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5992 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5993 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005995
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005996 /* Map special whitespace to '\t', \n', '\r' */
5997 else if (ch == '\t') {
5998 *p++ = '\\';
5999 *p++ = 't';
6000 }
6001 else if (ch == '\n') {
6002 *p++ = '\\';
6003 *p++ = 'n';
6004 }
6005 else if (ch == '\r') {
6006 *p++ = '\\';
6007 *p++ = 'r';
6008 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006009
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006010 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006011 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006013 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006014 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6015 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006016 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006017
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 /* Copy everything else as-is */
6019 else
6020 *p++ = (char) ch;
6021 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006023 assert(p - PyBytes_AS_STRING(repr) > 0);
6024 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6025 return NULL;
6026 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027}
6028
Alexander Belopolsky40018472011-02-26 01:02:56 +00006029PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006030PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6031 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006033 PyObject *result;
6034 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6035 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006037 result = PyUnicode_AsUnicodeEscapeString(tmp);
6038 Py_DECREF(tmp);
6039 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040}
6041
6042/* --- Raw Unicode Escape Codec ------------------------------------------- */
6043
Alexander Belopolsky40018472011-02-26 01:02:56 +00006044PyObject *
6045PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006046 Py_ssize_t size,
6047 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006049 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006050 Py_ssize_t startinpos;
6051 Py_ssize_t endinpos;
6052 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006053 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 const char *end;
6055 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056 PyObject *errorHandler = NULL;
6057 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006058
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 /* Escaped strings will always be longer than the resulting
6060 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006061 length after conversion to the true value. (But decoding error
6062 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006063 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006067 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006068 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 end = s + size;
6070 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 unsigned char c;
6072 Py_UCS4 x;
6073 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006074 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 /* Non-escape characters are interpreted as Unicode ordinals */
6077 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006078 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6079 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006081 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 startinpos = s-starts;
6083
6084 /* \u-escapes are only interpreted iff the number of leading
6085 backslashes if odd */
6086 bs = s;
6087 for (;s < end;) {
6088 if (*s != '\\')
6089 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006090 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6091 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006092 }
6093 if (((s - bs) & 1) == 0 ||
6094 s >= end ||
6095 (*s != 'u' && *s != 'U')) {
6096 continue;
6097 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006098 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 count = *s=='u' ? 4 : 8;
6100 s++;
6101
6102 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 for (x = 0, i = 0; i < count; ++i, ++s) {
6104 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006105 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006106 endinpos = s-starts;
6107 if (unicode_decode_call_errorhandler(
6108 errors, &errorHandler,
6109 "rawunicodeescape", "truncated \\uXXXX",
6110 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006111 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006112 goto onError;
6113 goto nextByte;
6114 }
6115 x = (x<<4) & ~0xF;
6116 if (c >= '0' && c <= '9')
6117 x += c - '0';
6118 else if (c >= 'a' && c <= 'f')
6119 x += 10 + c - 'a';
6120 else
6121 x += 10 + c - 'A';
6122 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006123 if (x <= 0x10ffff) {
6124 if (unicode_putchar(&v, &outpos, x) < 0)
6125 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006126 } else {
6127 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006128 if (unicode_decode_call_errorhandler(
6129 errors, &errorHandler,
6130 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006131 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006132 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006134 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 nextByte:
6136 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006138 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006140 Py_XDECREF(errorHandler);
6141 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006142 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006143
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006146 Py_XDECREF(errorHandler);
6147 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 return NULL;
6149}
6150
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006151
Alexander Belopolsky40018472011-02-26 01:02:56 +00006152PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006153PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006155 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156 char *p;
6157 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006158 Py_ssize_t expandsize, pos;
6159 int kind;
6160 void *data;
6161 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006163 if (!PyUnicode_Check(unicode)) {
6164 PyErr_BadArgument();
6165 return NULL;
6166 }
6167 if (PyUnicode_READY(unicode) < 0)
6168 return NULL;
6169 kind = PyUnicode_KIND(unicode);
6170 data = PyUnicode_DATA(unicode);
6171 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006172
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006173 switch(kind) {
6174 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6175 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6176 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6177 }
Victor Stinner0e368262011-11-10 20:12:49 +01006178
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006179 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006180 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006181
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006182 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183 if (repr == NULL)
6184 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006186 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006188 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006189 for (pos = 0; pos < len; pos++) {
6190 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 /* Map 32-bit characters to '\Uxxxxxxxx' */
6192 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006193 *p++ = '\\';
6194 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006195 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6196 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6197 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6198 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6199 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6200 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6201 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6202 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006203 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006205 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206 *p++ = '\\';
6207 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006208 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6209 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6210 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6211 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006213 /* Copy everything else as-is */
6214 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 *p++ = (char) ch;
6216 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006217
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006218 assert(p > q);
6219 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006220 return NULL;
6221 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222}
6223
Alexander Belopolsky40018472011-02-26 01:02:56 +00006224PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006225PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6226 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006228 PyObject *result;
6229 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6230 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006231 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006232 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6233 Py_DECREF(tmp);
6234 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235}
6236
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006237/* --- Unicode Internal Codec ------------------------------------------- */
6238
Alexander Belopolsky40018472011-02-26 01:02:56 +00006239PyObject *
6240_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006241 Py_ssize_t size,
6242 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006243{
6244 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006245 Py_ssize_t startinpos;
6246 Py_ssize_t endinpos;
6247 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006248 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006249 const char *end;
6250 const char *reason;
6251 PyObject *errorHandler = NULL;
6252 PyObject *exc = NULL;
6253
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006254 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006255 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006256 1))
6257 return NULL;
6258
Thomas Wouters89f507f2006-12-13 04:49:30 +00006259 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006260 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006261 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006263 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006264 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006265 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006266 end = s + size;
6267
6268 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006269 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006270 Py_UCS4 ch;
6271 /* We copy the raw representation one byte at a time because the
6272 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006273 ((char *) &uch)[0] = s[0];
6274 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006275#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006276 ((char *) &uch)[2] = s[2];
6277 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006278#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006279 ch = uch;
6280
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006281 /* We have to sanity check the raw data, otherwise doom looms for
6282 some malformed UCS-4 data. */
6283 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006284#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006285 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006286#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006287 end-s < Py_UNICODE_SIZE
6288 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006290 startinpos = s - starts;
6291 if (end-s < Py_UNICODE_SIZE) {
6292 endinpos = end-starts;
6293 reason = "truncated input";
6294 }
6295 else {
6296 endinpos = s - starts + Py_UNICODE_SIZE;
6297 reason = "illegal code point (> 0x10FFFF)";
6298 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006299 if (unicode_decode_call_errorhandler(
6300 errors, &errorHandler,
6301 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006302 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006303 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006304 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006305 continue;
6306 }
6307
6308 s += Py_UNICODE_SIZE;
6309#ifndef Py_UNICODE_WIDE
6310 if (ch >= 0xD800 && ch <= 0xDBFF && s < end)
6311 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006312 Py_UNICODE uch2;
6313 ((char *) &uch2)[0] = s[0];
6314 ((char *) &uch2)[1] = s[1];
6315 if (uch2 >= 0xDC00 && uch2 <= 0xDFFF)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006316 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006317 ch = (((uch & 0x3FF)<<10) | (uch2 & 0x3FF)) + 0x10000;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006318 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006319 }
6320 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006321#endif
6322
6323 if (unicode_putchar(&v, &outpos, ch) < 0)
6324 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006325 }
6326
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006327 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006328 goto onError;
6329 Py_XDECREF(errorHandler);
6330 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006331 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006332
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006334 Py_XDECREF(v);
6335 Py_XDECREF(errorHandler);
6336 Py_XDECREF(exc);
6337 return NULL;
6338}
6339
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340/* --- Latin-1 Codec ------------------------------------------------------ */
6341
Alexander Belopolsky40018472011-02-26 01:02:56 +00006342PyObject *
6343PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006344 Py_ssize_t size,
6345 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006348 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006349}
6350
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006351/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006352static void
6353make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006354 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006355 PyObject *unicode,
6356 Py_ssize_t startpos, Py_ssize_t endpos,
6357 const char *reason)
6358{
6359 if (*exceptionObject == NULL) {
6360 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006361 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006362 encoding, unicode, startpos, endpos, reason);
6363 }
6364 else {
6365 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6366 goto onError;
6367 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6368 goto onError;
6369 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6370 goto onError;
6371 return;
6372 onError:
6373 Py_DECREF(*exceptionObject);
6374 *exceptionObject = NULL;
6375 }
6376}
6377
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006378/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006379static void
6380raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006381 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006382 PyObject *unicode,
6383 Py_ssize_t startpos, Py_ssize_t endpos,
6384 const char *reason)
6385{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006386 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006387 encoding, unicode, startpos, endpos, reason);
6388 if (*exceptionObject != NULL)
6389 PyCodec_StrictErrors(*exceptionObject);
6390}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006391
6392/* error handling callback helper:
6393 build arguments, call the callback and check the arguments,
6394 put the result into newpos and return the replacement string, which
6395 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006396static PyObject *
6397unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006398 PyObject **errorHandler,
6399 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006400 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006401 Py_ssize_t startpos, Py_ssize_t endpos,
6402 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006403{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006404 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006405 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006406 PyObject *restuple;
6407 PyObject *resunicode;
6408
6409 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006411 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006413 }
6414
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006415 if (PyUnicode_READY(unicode) < 0)
6416 return NULL;
6417 len = PyUnicode_GET_LENGTH(unicode);
6418
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006419 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006420 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006421 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006422 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423
6424 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006425 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006426 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006428 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006429 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 Py_DECREF(restuple);
6431 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006432 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006433 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006434 &resunicode, newpos)) {
6435 Py_DECREF(restuple);
6436 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006437 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006438 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6439 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6440 Py_DECREF(restuple);
6441 return NULL;
6442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006443 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006444 *newpos = len + *newpos;
6445 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6447 Py_DECREF(restuple);
6448 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006449 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006450 Py_INCREF(resunicode);
6451 Py_DECREF(restuple);
6452 return resunicode;
6453}
6454
Alexander Belopolsky40018472011-02-26 01:02:56 +00006455static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006456unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006457 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006458 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006460 /* input state */
6461 Py_ssize_t pos=0, size;
6462 int kind;
6463 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006464 /* output object */
6465 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006466 /* pointer into the output */
6467 char *str;
6468 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006469 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006470 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6471 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006472 PyObject *errorHandler = NULL;
6473 PyObject *exc = NULL;
6474 /* the following variable is used for caching string comparisons
6475 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6476 int known_errorHandler = -1;
6477
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006478 if (PyUnicode_READY(unicode) < 0)
6479 return NULL;
6480 size = PyUnicode_GET_LENGTH(unicode);
6481 kind = PyUnicode_KIND(unicode);
6482 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483 /* allocate enough for a simple encoding without
6484 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006485 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006486 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006487 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006488 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006489 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006490 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006491 ressize = size;
6492
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006493 while (pos < size) {
6494 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006495
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 /* can we encode this? */
6497 if (c<limit) {
6498 /* no overflow check, because we know that the space is enough */
6499 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006500 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006501 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 Py_ssize_t requiredsize;
6504 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006505 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006507 Py_ssize_t collstart = pos;
6508 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006510 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 ++collend;
6512 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6513 if (known_errorHandler==-1) {
6514 if ((errors==NULL) || (!strcmp(errors, "strict")))
6515 known_errorHandler = 1;
6516 else if (!strcmp(errors, "replace"))
6517 known_errorHandler = 2;
6518 else if (!strcmp(errors, "ignore"))
6519 known_errorHandler = 3;
6520 else if (!strcmp(errors, "xmlcharrefreplace"))
6521 known_errorHandler = 4;
6522 else
6523 known_errorHandler = 0;
6524 }
6525 switch (known_errorHandler) {
6526 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006527 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 goto onError;
6529 case 2: /* replace */
6530 while (collstart++<collend)
6531 *str++ = '?'; /* fall through */
6532 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006533 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006534 break;
6535 case 4: /* xmlcharrefreplace */
6536 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006537 /* determine replacement size */
6538 for (i = collstart, repsize = 0; i < collend; ++i) {
6539 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6540 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006542 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006544 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006545 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006546 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006548#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 else
6550 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006551#else
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006552 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006553 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006554 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006555 repsize += 2+6+1;
6556 else
6557 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006558#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006560 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006561 if (requiredsize > ressize) {
6562 if (requiredsize<2*ressize)
6563 requiredsize = 2*ressize;
6564 if (_PyBytes_Resize(&res, requiredsize))
6565 goto onError;
6566 str = PyBytes_AS_STRING(res) + respos;
6567 ressize = requiredsize;
6568 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006569 /* generate replacement */
6570 for (i = collstart; i < collend; ++i) {
6571 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006573 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 break;
6575 default:
6576 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006577 encoding, reason, unicode, &exc,
6578 collstart, collend, &newpos);
6579 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6580 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006582 if (PyBytes_Check(repunicode)) {
6583 /* Directly copy bytes result to output. */
6584 repsize = PyBytes_Size(repunicode);
6585 if (repsize > 1) {
6586 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006587 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006588 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6589 Py_DECREF(repunicode);
6590 goto onError;
6591 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006592 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006593 ressize += repsize-1;
6594 }
6595 memcpy(str, PyBytes_AsString(repunicode), repsize);
6596 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006597 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006598 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006599 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006600 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 /* need more space? (at least enough for what we
6602 have+the replacement+the rest of the string, so
6603 we won't have to check space for encodable characters) */
6604 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006605 repsize = PyUnicode_GET_LENGTH(repunicode);
6606 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 if (requiredsize > ressize) {
6608 if (requiredsize<2*ressize)
6609 requiredsize = 2*ressize;
6610 if (_PyBytes_Resize(&res, requiredsize)) {
6611 Py_DECREF(repunicode);
6612 goto onError;
6613 }
6614 str = PyBytes_AS_STRING(res) + respos;
6615 ressize = requiredsize;
6616 }
6617 /* check if there is anything unencodable in the replacement
6618 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006619 for (i = 0; repsize-->0; ++i, ++str) {
6620 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006621 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006622 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006623 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 Py_DECREF(repunicode);
6625 goto onError;
6626 }
6627 *str = (char)c;
6628 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006629 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006630 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006631 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006632 }
6633 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006634 /* Resize if we allocated to much */
6635 size = str - PyBytes_AS_STRING(res);
6636 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006637 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006638 if (_PyBytes_Resize(&res, size) < 0)
6639 goto onError;
6640 }
6641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006642 Py_XDECREF(errorHandler);
6643 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006644 return res;
6645
6646 onError:
6647 Py_XDECREF(res);
6648 Py_XDECREF(errorHandler);
6649 Py_XDECREF(exc);
6650 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006651}
6652
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006653/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006654PyObject *
6655PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006656 Py_ssize_t size,
6657 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006659 PyObject *result;
6660 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6661 if (unicode == NULL)
6662 return NULL;
6663 result = unicode_encode_ucs1(unicode, errors, 256);
6664 Py_DECREF(unicode);
6665 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666}
6667
Alexander Belopolsky40018472011-02-26 01:02:56 +00006668PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006669_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670{
6671 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 PyErr_BadArgument();
6673 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006675 if (PyUnicode_READY(unicode) == -1)
6676 return NULL;
6677 /* Fast path: if it is a one-byte string, construct
6678 bytes object directly. */
6679 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6680 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6681 PyUnicode_GET_LENGTH(unicode));
6682 /* Non-Latin-1 characters present. Defer to above function to
6683 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006684 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006685}
6686
6687PyObject*
6688PyUnicode_AsLatin1String(PyObject *unicode)
6689{
6690 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691}
6692
6693/* --- 7-bit ASCII Codec -------------------------------------------------- */
6694
Alexander Belopolsky40018472011-02-26 01:02:56 +00006695PyObject *
6696PyUnicode_DecodeASCII(const char *s,
6697 Py_ssize_t size,
6698 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006699{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006700 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006701 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006702 int kind;
6703 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006704 Py_ssize_t startinpos;
6705 Py_ssize_t endinpos;
6706 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006708 int has_error;
6709 const unsigned char *p = (const unsigned char *)s;
6710 const unsigned char *end = p + size;
6711 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006712 PyObject *errorHandler = NULL;
6713 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006714
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006715 if (size == 0) {
6716 Py_INCREF(unicode_empty);
6717 return unicode_empty;
6718 }
6719
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006721 if (size == 1 && (unsigned char)s[0] < 128)
6722 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006723
Victor Stinner702c7342011-10-05 13:50:52 +02006724 has_error = 0;
6725 while (p < end && !has_error) {
6726 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6727 an explanation. */
6728 if (!((size_t) p & LONG_PTR_MASK)) {
6729 /* Help register allocation */
6730 register const unsigned char *_p = p;
6731 while (_p < aligned_end) {
6732 unsigned long value = *(unsigned long *) _p;
6733 if (value & ASCII_CHAR_MASK) {
6734 has_error = 1;
6735 break;
6736 }
6737 _p += SIZEOF_LONG;
6738 }
6739 if (_p == end)
6740 break;
6741 if (has_error)
6742 break;
6743 p = _p;
6744 }
6745 if (*p & 0x80) {
6746 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006747 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006748 }
6749 else {
6750 ++p;
6751 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006752 }
Victor Stinner702c7342011-10-05 13:50:52 +02006753 if (!has_error)
6754 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006755
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006756 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006758 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006760 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006761 kind = PyUnicode_KIND(v);
6762 data = PyUnicode_DATA(v);
6763 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006764 e = s + size;
6765 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 register unsigned char c = (unsigned char)*s;
6767 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006768 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006769 ++s;
6770 }
6771 else {
6772 startinpos = s-starts;
6773 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 if (unicode_decode_call_errorhandler(
6775 errors, &errorHandler,
6776 "ascii", "ordinal not in range(128)",
6777 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006778 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006780 kind = PyUnicode_KIND(v);
6781 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006784 if (PyUnicode_Resize(&v, outpos) < 0)
6785 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006786 Py_XDECREF(errorHandler);
6787 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006788 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006789 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006790
Benjamin Peterson29060642009-01-31 22:14:21 +00006791 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006793 Py_XDECREF(errorHandler);
6794 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795 return NULL;
6796}
6797
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006798/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006799PyObject *
6800PyUnicode_EncodeASCII(const Py_UNICODE *p,
6801 Py_ssize_t size,
6802 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006804 PyObject *result;
6805 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6806 if (unicode == NULL)
6807 return NULL;
6808 result = unicode_encode_ucs1(unicode, errors, 128);
6809 Py_DECREF(unicode);
6810 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811}
6812
Alexander Belopolsky40018472011-02-26 01:02:56 +00006813PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006814_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815{
6816 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006817 PyErr_BadArgument();
6818 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006820 if (PyUnicode_READY(unicode) == -1)
6821 return NULL;
6822 /* Fast path: if it is an ASCII-only string, construct bytes object
6823 directly. Else defer to above function to raise the exception. */
6824 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6825 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6826 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006827 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006828}
6829
6830PyObject *
6831PyUnicode_AsASCIIString(PyObject *unicode)
6832{
6833 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006834}
6835
Victor Stinner99b95382011-07-04 14:23:54 +02006836#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006837
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006838/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006839
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006840#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006841#define NEED_RETRY
6842#endif
6843
Victor Stinner3a50e702011-10-18 21:21:00 +02006844#ifndef WC_ERR_INVALID_CHARS
6845# define WC_ERR_INVALID_CHARS 0x0080
6846#endif
6847
6848static char*
6849code_page_name(UINT code_page, PyObject **obj)
6850{
6851 *obj = NULL;
6852 if (code_page == CP_ACP)
6853 return "mbcs";
6854 if (code_page == CP_UTF7)
6855 return "CP_UTF7";
6856 if (code_page == CP_UTF8)
6857 return "CP_UTF8";
6858
6859 *obj = PyBytes_FromFormat("cp%u", code_page);
6860 if (*obj == NULL)
6861 return NULL;
6862 return PyBytes_AS_STRING(*obj);
6863}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006864
Alexander Belopolsky40018472011-02-26 01:02:56 +00006865static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006866is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006867{
6868 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006869 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006870
Victor Stinner3a50e702011-10-18 21:21:00 +02006871 if (!IsDBCSLeadByteEx(code_page, *curr))
6872 return 0;
6873
6874 prev = CharPrevExA(code_page, s, curr, 0);
6875 if (prev == curr)
6876 return 1;
6877 /* FIXME: This code is limited to "true" double-byte encodings,
6878 as it assumes an incomplete character consists of a single
6879 byte. */
6880 if (curr - prev == 2)
6881 return 1;
6882 if (!IsDBCSLeadByteEx(code_page, *prev))
6883 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006884 return 0;
6885}
6886
Victor Stinner3a50e702011-10-18 21:21:00 +02006887static DWORD
6888decode_code_page_flags(UINT code_page)
6889{
6890 if (code_page == CP_UTF7) {
6891 /* The CP_UTF7 decoder only supports flags=0 */
6892 return 0;
6893 }
6894 else
6895 return MB_ERR_INVALID_CHARS;
6896}
6897
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006898/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006899 * Decode a byte string from a Windows code page into unicode object in strict
6900 * mode.
6901 *
6902 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6903 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006904 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006905static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006906decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006907 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006908 const char *in,
6909 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006910{
Victor Stinner3a50e702011-10-18 21:21:00 +02006911 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006912 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006913 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006914
6915 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006916 assert(insize > 0);
6917 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6918 if (outsize <= 0)
6919 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006920
6921 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006923 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 if (*v == NULL)
6925 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006926 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006927 }
6928 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006930 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006931 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006933 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006934 }
6935
6936 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006937 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6938 if (outsize <= 0)
6939 goto error;
6940 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006941
Victor Stinner3a50e702011-10-18 21:21:00 +02006942error:
6943 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6944 return -2;
6945 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006946 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006947}
6948
Victor Stinner3a50e702011-10-18 21:21:00 +02006949/*
6950 * Decode a byte string from a code page into unicode object with an error
6951 * handler.
6952 *
6953 * Returns consumed size if succeed, or raise a WindowsError or
6954 * UnicodeDecodeError exception and returns -1 on error.
6955 */
6956static int
6957decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006958 PyObject **v,
6959 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006960 const char *errors)
6961{
6962 const char *startin = in;
6963 const char *endin = in + size;
6964 const DWORD flags = decode_code_page_flags(code_page);
6965 /* Ideally, we should get reason from FormatMessage. This is the Windows
6966 2000 English version of the message. */
6967 const char *reason = "No mapping for the Unicode character exists "
6968 "in the target code page.";
6969 /* each step cannot decode more than 1 character, but a character can be
6970 represented as a surrogate pair */
6971 wchar_t buffer[2], *startout, *out;
6972 int insize, outsize;
6973 PyObject *errorHandler = NULL;
6974 PyObject *exc = NULL;
6975 PyObject *encoding_obj = NULL;
6976 char *encoding;
6977 DWORD err;
6978 int ret = -1;
6979
6980 assert(size > 0);
6981
6982 encoding = code_page_name(code_page, &encoding_obj);
6983 if (encoding == NULL)
6984 return -1;
6985
6986 if (errors == NULL || strcmp(errors, "strict") == 0) {
6987 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6988 UnicodeDecodeError. */
6989 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6990 if (exc != NULL) {
6991 PyCodec_StrictErrors(exc);
6992 Py_CLEAR(exc);
6993 }
6994 goto error;
6995 }
6996
6997 if (*v == NULL) {
6998 /* Create unicode object */
6999 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7000 PyErr_NoMemory();
7001 goto error;
7002 }
Victor Stinner76a31a62011-11-04 00:05:13 +01007003 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007004 if (*v == NULL)
7005 goto error;
7006 startout = PyUnicode_AS_UNICODE(*v);
7007 }
7008 else {
7009 /* Extend unicode object */
7010 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7011 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7012 PyErr_NoMemory();
7013 goto error;
7014 }
Victor Stinner76a31a62011-11-04 00:05:13 +01007015 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007016 goto error;
7017 startout = PyUnicode_AS_UNICODE(*v) + n;
7018 }
7019
7020 /* Decode the byte string character per character */
7021 out = startout;
7022 while (in < endin)
7023 {
7024 /* Decode a character */
7025 insize = 1;
7026 do
7027 {
7028 outsize = MultiByteToWideChar(code_page, flags,
7029 in, insize,
7030 buffer, Py_ARRAY_LENGTH(buffer));
7031 if (outsize > 0)
7032 break;
7033 err = GetLastError();
7034 if (err != ERROR_NO_UNICODE_TRANSLATION
7035 && err != ERROR_INSUFFICIENT_BUFFER)
7036 {
7037 PyErr_SetFromWindowsErr(0);
7038 goto error;
7039 }
7040 insize++;
7041 }
7042 /* 4=maximum length of a UTF-8 sequence */
7043 while (insize <= 4 && (in + insize) <= endin);
7044
7045 if (outsize <= 0) {
7046 Py_ssize_t startinpos, endinpos, outpos;
7047
7048 startinpos = in - startin;
7049 endinpos = startinpos + 1;
7050 outpos = out - PyUnicode_AS_UNICODE(*v);
7051 if (unicode_decode_call_errorhandler(
7052 errors, &errorHandler,
7053 encoding, reason,
7054 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007055 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007056 {
7057 goto error;
7058 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007059 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007060 }
7061 else {
7062 in += insize;
7063 memcpy(out, buffer, outsize * sizeof(wchar_t));
7064 out += outsize;
7065 }
7066 }
7067
7068 /* write a NUL character at the end */
7069 *out = 0;
7070
7071 /* Extend unicode object */
7072 outsize = out - startout;
7073 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007074 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007075 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007076 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007077
7078error:
7079 Py_XDECREF(encoding_obj);
7080 Py_XDECREF(errorHandler);
7081 Py_XDECREF(exc);
7082 return ret;
7083}
7084
Victor Stinner3a50e702011-10-18 21:21:00 +02007085static PyObject *
7086decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007087 const char *s, Py_ssize_t size,
7088 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007089{
Victor Stinner76a31a62011-11-04 00:05:13 +01007090 PyObject *v = NULL;
7091 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007092
Victor Stinner3a50e702011-10-18 21:21:00 +02007093 if (code_page < 0) {
7094 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7095 return NULL;
7096 }
7097
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007098 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007099 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007100
Victor Stinner76a31a62011-11-04 00:05:13 +01007101 do
7102 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007103#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007104 if (size > INT_MAX) {
7105 chunk_size = INT_MAX;
7106 final = 0;
7107 done = 0;
7108 }
7109 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007111 {
7112 chunk_size = (int)size;
7113 final = (consumed == NULL);
7114 done = 1;
7115 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007116
Victor Stinner76a31a62011-11-04 00:05:13 +01007117 /* Skip trailing lead-byte unless 'final' is set */
7118 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7119 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007120
Victor Stinner76a31a62011-11-04 00:05:13 +01007121 if (chunk_size == 0 && done) {
7122 if (v != NULL)
7123 break;
7124 Py_INCREF(unicode_empty);
7125 return unicode_empty;
7126 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007127
Victor Stinner76a31a62011-11-04 00:05:13 +01007128
7129 converted = decode_code_page_strict(code_page, &v,
7130 s, chunk_size);
7131 if (converted == -2)
7132 converted = decode_code_page_errors(code_page, &v,
7133 s, chunk_size,
7134 errors);
7135 assert(converted != 0);
7136
7137 if (converted < 0) {
7138 Py_XDECREF(v);
7139 return NULL;
7140 }
7141
7142 if (consumed)
7143 *consumed += converted;
7144
7145 s += converted;
7146 size -= converted;
7147 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007148
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007149 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007150}
7151
Alexander Belopolsky40018472011-02-26 01:02:56 +00007152PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007153PyUnicode_DecodeCodePageStateful(int code_page,
7154 const char *s,
7155 Py_ssize_t size,
7156 const char *errors,
7157 Py_ssize_t *consumed)
7158{
7159 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7160}
7161
7162PyObject *
7163PyUnicode_DecodeMBCSStateful(const char *s,
7164 Py_ssize_t size,
7165 const char *errors,
7166 Py_ssize_t *consumed)
7167{
7168 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7169}
7170
7171PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007172PyUnicode_DecodeMBCS(const char *s,
7173 Py_ssize_t size,
7174 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007175{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007176 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7177}
7178
Victor Stinner3a50e702011-10-18 21:21:00 +02007179static DWORD
7180encode_code_page_flags(UINT code_page, const char *errors)
7181{
7182 if (code_page == CP_UTF8) {
7183 if (winver.dwMajorVersion >= 6)
7184 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7185 and later */
7186 return WC_ERR_INVALID_CHARS;
7187 else
7188 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7189 return 0;
7190 }
7191 else if (code_page == CP_UTF7) {
7192 /* CP_UTF7 only supports flags=0 */
7193 return 0;
7194 }
7195 else {
7196 if (errors != NULL && strcmp(errors, "replace") == 0)
7197 return 0;
7198 else
7199 return WC_NO_BEST_FIT_CHARS;
7200 }
7201}
7202
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007203/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 * Encode a Unicode string to a Windows code page into a byte string in strict
7205 * mode.
7206 *
7207 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7208 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007209 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007210static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007211encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007212 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007213 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007214{
Victor Stinner554f3f02010-06-16 23:33:54 +00007215 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 BOOL *pusedDefaultChar = &usedDefaultChar;
7217 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007218 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007219 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007220 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 const DWORD flags = encode_code_page_flags(code_page, NULL);
7222 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007223 /* Create a substring so that we can get the UTF-16 representation
7224 of just the slice under consideration. */
7225 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007226
Martin v. Löwis3d325192011-11-04 18:23:06 +01007227 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007228
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007230 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007231 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007232 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007233
Victor Stinner2fc507f2011-11-04 20:06:39 +01007234 substring = PyUnicode_Substring(unicode, offset, offset+len);
7235 if (substring == NULL)
7236 return -1;
7237 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7238 if (p == NULL) {
7239 Py_DECREF(substring);
7240 return -1;
7241 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007242
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007243 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007244 outsize = WideCharToMultiByte(code_page, flags,
7245 p, size,
7246 NULL, 0,
7247 NULL, pusedDefaultChar);
7248 if (outsize <= 0)
7249 goto error;
7250 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007251 if (pusedDefaultChar && *pusedDefaultChar) {
7252 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007253 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007254 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007255
Victor Stinner3a50e702011-10-18 21:21:00 +02007256 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007257 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007259 if (*outbytes == NULL) {
7260 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007261 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007262 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007263 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007264 }
7265 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007267 const Py_ssize_t n = PyBytes_Size(*outbytes);
7268 if (outsize > PY_SSIZE_T_MAX - n) {
7269 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007270 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007271 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007273 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7274 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007275 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007276 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007277 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007278 }
7279
7280 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007281 outsize = WideCharToMultiByte(code_page, flags,
7282 p, size,
7283 out, outsize,
7284 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007285 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007286 if (outsize <= 0)
7287 goto error;
7288 if (pusedDefaultChar && *pusedDefaultChar)
7289 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007290 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007291
Victor Stinner3a50e702011-10-18 21:21:00 +02007292error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007293 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007294 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7295 return -2;
7296 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007297 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007298}
7299
Victor Stinner3a50e702011-10-18 21:21:00 +02007300/*
7301 * Encode a Unicode string to a Windows code page into a byte string using a
7302 * error handler.
7303 *
7304 * Returns consumed characters if succeed, or raise a WindowsError and returns
7305 * -1 on other error.
7306 */
7307static int
7308encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007309 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007310 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007311{
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007313 Py_ssize_t pos = unicode_offset;
7314 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007315 /* Ideally, we should get reason from FormatMessage. This is the Windows
7316 2000 English version of the message. */
7317 const char *reason = "invalid character";
7318 /* 4=maximum length of a UTF-8 sequence */
7319 char buffer[4];
7320 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7321 Py_ssize_t outsize;
7322 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007323 PyObject *errorHandler = NULL;
7324 PyObject *exc = NULL;
7325 PyObject *encoding_obj = NULL;
7326 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007327 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007328 PyObject *rep;
7329 int ret = -1;
7330
7331 assert(insize > 0);
7332
7333 encoding = code_page_name(code_page, &encoding_obj);
7334 if (encoding == NULL)
7335 return -1;
7336
7337 if (errors == NULL || strcmp(errors, "strict") == 0) {
7338 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7339 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007340 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007341 if (exc != NULL) {
7342 PyCodec_StrictErrors(exc);
7343 Py_DECREF(exc);
7344 }
7345 Py_XDECREF(encoding_obj);
7346 return -1;
7347 }
7348
7349 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7350 pusedDefaultChar = &usedDefaultChar;
7351 else
7352 pusedDefaultChar = NULL;
7353
7354 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7355 PyErr_NoMemory();
7356 goto error;
7357 }
7358 outsize = insize * Py_ARRAY_LENGTH(buffer);
7359
7360 if (*outbytes == NULL) {
7361 /* Create string object */
7362 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7363 if (*outbytes == NULL)
7364 goto error;
7365 out = PyBytes_AS_STRING(*outbytes);
7366 }
7367 else {
7368 /* Extend string object */
7369 Py_ssize_t n = PyBytes_Size(*outbytes);
7370 if (n > PY_SSIZE_T_MAX - outsize) {
7371 PyErr_NoMemory();
7372 goto error;
7373 }
7374 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7375 goto error;
7376 out = PyBytes_AS_STRING(*outbytes) + n;
7377 }
7378
7379 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007380 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007381 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007382 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7383 wchar_t chars[2];
7384 int charsize;
7385 if (ch < 0x10000) {
7386 chars[0] = (wchar_t)ch;
7387 charsize = 1;
7388 }
7389 else {
7390 ch -= 0x10000;
7391 chars[0] = 0xd800 + (ch >> 10);
7392 chars[1] = 0xdc00 + (ch & 0x3ff);
7393 charsize = 2;
7394 }
7395
Victor Stinner3a50e702011-10-18 21:21:00 +02007396 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007397 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007398 buffer, Py_ARRAY_LENGTH(buffer),
7399 NULL, pusedDefaultChar);
7400 if (outsize > 0) {
7401 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7402 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007403 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007404 memcpy(out, buffer, outsize);
7405 out += outsize;
7406 continue;
7407 }
7408 }
7409 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7410 PyErr_SetFromWindowsErr(0);
7411 goto error;
7412 }
7413
Victor Stinner3a50e702011-10-18 21:21:00 +02007414 rep = unicode_encode_call_errorhandler(
7415 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007416 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007417 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007418 if (rep == NULL)
7419 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007420 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007421
7422 if (PyBytes_Check(rep)) {
7423 outsize = PyBytes_GET_SIZE(rep);
7424 if (outsize != 1) {
7425 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7426 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7427 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7428 Py_DECREF(rep);
7429 goto error;
7430 }
7431 out = PyBytes_AS_STRING(*outbytes) + offset;
7432 }
7433 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7434 out += outsize;
7435 }
7436 else {
7437 Py_ssize_t i;
7438 enum PyUnicode_Kind kind;
7439 void *data;
7440
7441 if (PyUnicode_READY(rep) < 0) {
7442 Py_DECREF(rep);
7443 goto error;
7444 }
7445
7446 outsize = PyUnicode_GET_LENGTH(rep);
7447 if (outsize != 1) {
7448 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7449 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7450 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7451 Py_DECREF(rep);
7452 goto error;
7453 }
7454 out = PyBytes_AS_STRING(*outbytes) + offset;
7455 }
7456 kind = PyUnicode_KIND(rep);
7457 data = PyUnicode_DATA(rep);
7458 for (i=0; i < outsize; i++) {
7459 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7460 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007461 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007462 encoding, unicode,
7463 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007464 "unable to encode error handler result to ASCII");
7465 Py_DECREF(rep);
7466 goto error;
7467 }
7468 *out = (unsigned char)ch;
7469 out++;
7470 }
7471 }
7472 Py_DECREF(rep);
7473 }
7474 /* write a NUL byte */
7475 *out = 0;
7476 outsize = out - PyBytes_AS_STRING(*outbytes);
7477 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7478 if (_PyBytes_Resize(outbytes, outsize) < 0)
7479 goto error;
7480 ret = 0;
7481
7482error:
7483 Py_XDECREF(encoding_obj);
7484 Py_XDECREF(errorHandler);
7485 Py_XDECREF(exc);
7486 return ret;
7487}
7488
Victor Stinner3a50e702011-10-18 21:21:00 +02007489static PyObject *
7490encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007491 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 const char *errors)
7493{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007494 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007496 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007497 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007498
Victor Stinner2fc507f2011-11-04 20:06:39 +01007499 if (PyUnicode_READY(unicode) < 0)
7500 return NULL;
7501 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007502
Victor Stinner3a50e702011-10-18 21:21:00 +02007503 if (code_page < 0) {
7504 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7505 return NULL;
7506 }
7507
Martin v. Löwis3d325192011-11-04 18:23:06 +01007508 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007509 return PyBytes_FromStringAndSize(NULL, 0);
7510
Victor Stinner7581cef2011-11-03 22:32:33 +01007511 offset = 0;
7512 do
7513 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007514#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007515 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007516 chunks. */
7517 if (len > INT_MAX/2) {
7518 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007519 done = 0;
7520 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007521 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007522#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007523 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007524 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007525 done = 1;
7526 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007527
Victor Stinner76a31a62011-11-04 00:05:13 +01007528 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007529 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007530 errors);
7531 if (ret == -2)
7532 ret = encode_code_page_errors(code_page, &outbytes,
7533 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007534 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007535 if (ret < 0) {
7536 Py_XDECREF(outbytes);
7537 return NULL;
7538 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007539
Victor Stinner7581cef2011-11-03 22:32:33 +01007540 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007541 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007542 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007543
Victor Stinner3a50e702011-10-18 21:21:00 +02007544 return outbytes;
7545}
7546
7547PyObject *
7548PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7549 Py_ssize_t size,
7550 const char *errors)
7551{
Victor Stinner7581cef2011-11-03 22:32:33 +01007552 PyObject *unicode, *res;
7553 unicode = PyUnicode_FromUnicode(p, size);
7554 if (unicode == NULL)
7555 return NULL;
7556 res = encode_code_page(CP_ACP, unicode, errors);
7557 Py_DECREF(unicode);
7558 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007559}
7560
7561PyObject *
7562PyUnicode_EncodeCodePage(int code_page,
7563 PyObject *unicode,
7564 const char *errors)
7565{
Victor Stinner7581cef2011-11-03 22:32:33 +01007566 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007567}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007568
Alexander Belopolsky40018472011-02-26 01:02:56 +00007569PyObject *
7570PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007571{
7572 if (!PyUnicode_Check(unicode)) {
7573 PyErr_BadArgument();
7574 return NULL;
7575 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007576 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007577}
7578
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007579#undef NEED_RETRY
7580
Victor Stinner99b95382011-07-04 14:23:54 +02007581#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007582
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583/* --- Character Mapping Codec -------------------------------------------- */
7584
Alexander Belopolsky40018472011-02-26 01:02:56 +00007585PyObject *
7586PyUnicode_DecodeCharmap(const char *s,
7587 Py_ssize_t size,
7588 PyObject *mapping,
7589 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007591 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007592 Py_ssize_t startinpos;
7593 Py_ssize_t endinpos;
7594 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007595 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007596 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007597 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007598 PyObject *errorHandler = NULL;
7599 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007600
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601 /* Default to Latin-1 */
7602 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007605 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007608 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007609 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007610 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007611 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007612 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007613 Py_ssize_t maplen;
7614 enum PyUnicode_Kind kind;
7615 void *data;
7616 Py_UCS4 x;
7617
7618 if (PyUnicode_READY(mapping) < 0)
7619 return NULL;
7620
7621 maplen = PyUnicode_GET_LENGTH(mapping);
7622 data = PyUnicode_DATA(mapping);
7623 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007624 while (s < e) {
7625 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007626
Benjamin Peterson29060642009-01-31 22:14:21 +00007627 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007628 x = PyUnicode_READ(kind, data, ch);
7629 else
7630 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007631
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007632 if (x == 0xfffe)
7633 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007634 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007635 startinpos = s-starts;
7636 endinpos = startinpos+1;
7637 if (unicode_decode_call_errorhandler(
7638 errors, &errorHandler,
7639 "charmap", "character maps to <undefined>",
7640 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007641 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 goto onError;
7643 }
7644 continue;
7645 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007646
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007647 if (unicode_putchar(&v, &outpos, x) < 0)
7648 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007649 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007650 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007651 }
7652 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007653 while (s < e) {
7654 unsigned char ch = *s;
7655 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007656
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7658 w = PyLong_FromLong((long)ch);
7659 if (w == NULL)
7660 goto onError;
7661 x = PyObject_GetItem(mapping, w);
7662 Py_DECREF(w);
7663 if (x == NULL) {
7664 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7665 /* No mapping found means: mapping is undefined. */
7666 PyErr_Clear();
7667 x = Py_None;
7668 Py_INCREF(x);
7669 } else
7670 goto onError;
7671 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007672
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 /* Apply mapping */
7674 if (PyLong_Check(x)) {
7675 long value = PyLong_AS_LONG(x);
7676 if (value < 0 || value > 65535) {
7677 PyErr_SetString(PyExc_TypeError,
7678 "character mapping must be in range(65536)");
7679 Py_DECREF(x);
7680 goto onError;
7681 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007682 if (unicode_putchar(&v, &outpos, value) < 0)
7683 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 }
7685 else if (x == Py_None) {
7686 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007687 startinpos = s-starts;
7688 endinpos = startinpos+1;
7689 if (unicode_decode_call_errorhandler(
7690 errors, &errorHandler,
7691 "charmap", "character maps to <undefined>",
7692 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007693 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007694 Py_DECREF(x);
7695 goto onError;
7696 }
7697 Py_DECREF(x);
7698 continue;
7699 }
7700 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007701 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007702
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007703 if (PyUnicode_READY(x) < 0)
7704 goto onError;
7705 targetsize = PyUnicode_GET_LENGTH(x);
7706
7707 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007708 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007709 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007710 PyUnicode_READ_CHAR(x, 0)) < 0)
7711 goto onError;
7712 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007713 else if (targetsize > 1) {
7714 /* 1-n mapping */
7715 if (targetsize > extrachars) {
7716 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007717 Py_ssize_t needed = (targetsize - extrachars) + \
7718 (targetsize << 2);
7719 extrachars += needed;
7720 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007721 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007722 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007723 Py_DECREF(x);
7724 goto onError;
7725 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007727 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7728 goto onError;
7729 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7730 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007731 extrachars -= targetsize;
7732 }
7733 /* 1-0 mapping: skip the character */
7734 }
7735 else {
7736 /* wrong return value */
7737 PyErr_SetString(PyExc_TypeError,
7738 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007739 Py_DECREF(x);
7740 goto onError;
7741 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 Py_DECREF(x);
7743 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007745 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007746 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007747 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007748 Py_XDECREF(errorHandler);
7749 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007750 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007751
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007753 Py_XDECREF(errorHandler);
7754 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007755 Py_XDECREF(v);
7756 return NULL;
7757}
7758
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007759/* Charmap encoding: the lookup table */
7760
Alexander Belopolsky40018472011-02-26 01:02:56 +00007761struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 PyObject_HEAD
7763 unsigned char level1[32];
7764 int count2, count3;
7765 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007766};
7767
7768static PyObject*
7769encoding_map_size(PyObject *obj, PyObject* args)
7770{
7771 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007772 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007774}
7775
7776static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007777 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 PyDoc_STR("Return the size (in bytes) of this object") },
7779 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007780};
7781
7782static void
7783encoding_map_dealloc(PyObject* o)
7784{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007785 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007786}
7787
7788static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007789 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007790 "EncodingMap", /*tp_name*/
7791 sizeof(struct encoding_map), /*tp_basicsize*/
7792 0, /*tp_itemsize*/
7793 /* methods */
7794 encoding_map_dealloc, /*tp_dealloc*/
7795 0, /*tp_print*/
7796 0, /*tp_getattr*/
7797 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007798 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007799 0, /*tp_repr*/
7800 0, /*tp_as_number*/
7801 0, /*tp_as_sequence*/
7802 0, /*tp_as_mapping*/
7803 0, /*tp_hash*/
7804 0, /*tp_call*/
7805 0, /*tp_str*/
7806 0, /*tp_getattro*/
7807 0, /*tp_setattro*/
7808 0, /*tp_as_buffer*/
7809 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7810 0, /*tp_doc*/
7811 0, /*tp_traverse*/
7812 0, /*tp_clear*/
7813 0, /*tp_richcompare*/
7814 0, /*tp_weaklistoffset*/
7815 0, /*tp_iter*/
7816 0, /*tp_iternext*/
7817 encoding_map_methods, /*tp_methods*/
7818 0, /*tp_members*/
7819 0, /*tp_getset*/
7820 0, /*tp_base*/
7821 0, /*tp_dict*/
7822 0, /*tp_descr_get*/
7823 0, /*tp_descr_set*/
7824 0, /*tp_dictoffset*/
7825 0, /*tp_init*/
7826 0, /*tp_alloc*/
7827 0, /*tp_new*/
7828 0, /*tp_free*/
7829 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007830};
7831
7832PyObject*
7833PyUnicode_BuildEncodingMap(PyObject* string)
7834{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007835 PyObject *result;
7836 struct encoding_map *mresult;
7837 int i;
7838 int need_dict = 0;
7839 unsigned char level1[32];
7840 unsigned char level2[512];
7841 unsigned char *mlevel1, *mlevel2, *mlevel3;
7842 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007843 int kind;
7844 void *data;
7845 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007847 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007848 PyErr_BadArgument();
7849 return NULL;
7850 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007851 kind = PyUnicode_KIND(string);
7852 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007853 memset(level1, 0xFF, sizeof level1);
7854 memset(level2, 0xFF, sizeof level2);
7855
7856 /* If there isn't a one-to-one mapping of NULL to \0,
7857 or if there are non-BMP characters, we need to use
7858 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007859 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007860 need_dict = 1;
7861 for (i = 1; i < 256; i++) {
7862 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007863 ch = PyUnicode_READ(kind, data, i);
7864 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007865 need_dict = 1;
7866 break;
7867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007868 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007869 /* unmapped character */
7870 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007871 l1 = ch >> 11;
7872 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007873 if (level1[l1] == 0xFF)
7874 level1[l1] = count2++;
7875 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007876 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007877 }
7878
7879 if (count2 >= 0xFF || count3 >= 0xFF)
7880 need_dict = 1;
7881
7882 if (need_dict) {
7883 PyObject *result = PyDict_New();
7884 PyObject *key, *value;
7885 if (!result)
7886 return NULL;
7887 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007888 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007889 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007890 if (!key || !value)
7891 goto failed1;
7892 if (PyDict_SetItem(result, key, value) == -1)
7893 goto failed1;
7894 Py_DECREF(key);
7895 Py_DECREF(value);
7896 }
7897 return result;
7898 failed1:
7899 Py_XDECREF(key);
7900 Py_XDECREF(value);
7901 Py_DECREF(result);
7902 return NULL;
7903 }
7904
7905 /* Create a three-level trie */
7906 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7907 16*count2 + 128*count3 - 1);
7908 if (!result)
7909 return PyErr_NoMemory();
7910 PyObject_Init(result, &EncodingMapType);
7911 mresult = (struct encoding_map*)result;
7912 mresult->count2 = count2;
7913 mresult->count3 = count3;
7914 mlevel1 = mresult->level1;
7915 mlevel2 = mresult->level23;
7916 mlevel3 = mresult->level23 + 16*count2;
7917 memcpy(mlevel1, level1, 32);
7918 memset(mlevel2, 0xFF, 16*count2);
7919 memset(mlevel3, 0, 128*count3);
7920 count3 = 0;
7921 for (i = 1; i < 256; i++) {
7922 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007923 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007924 /* unmapped character */
7925 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007926 o1 = PyUnicode_READ(kind, data, i)>>11;
7927 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007928 i2 = 16*mlevel1[o1] + o2;
7929 if (mlevel2[i2] == 0xFF)
7930 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007931 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007932 i3 = 128*mlevel2[i2] + o3;
7933 mlevel3[i3] = i;
7934 }
7935 return result;
7936}
7937
7938static int
Victor Stinner22168992011-11-20 17:09:18 +01007939encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007940{
7941 struct encoding_map *map = (struct encoding_map*)mapping;
7942 int l1 = c>>11;
7943 int l2 = (c>>7) & 0xF;
7944 int l3 = c & 0x7F;
7945 int i;
7946
Victor Stinner22168992011-11-20 17:09:18 +01007947 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007948 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007949 if (c == 0)
7950 return 0;
7951 /* level 1*/
7952 i = map->level1[l1];
7953 if (i == 0xFF) {
7954 return -1;
7955 }
7956 /* level 2*/
7957 i = map->level23[16*i+l2];
7958 if (i == 0xFF) {
7959 return -1;
7960 }
7961 /* level 3 */
7962 i = map->level23[16*map->count2 + 128*i + l3];
7963 if (i == 0) {
7964 return -1;
7965 }
7966 return i;
7967}
7968
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007969/* Lookup the character ch in the mapping. If the character
7970 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007971 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007972static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007973charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974{
Christian Heimes217cfd12007-12-02 14:31:20 +00007975 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007976 PyObject *x;
7977
7978 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007980 x = PyObject_GetItem(mapping, w);
7981 Py_DECREF(w);
7982 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7984 /* No mapping found means: mapping is undefined. */
7985 PyErr_Clear();
7986 x = Py_None;
7987 Py_INCREF(x);
7988 return x;
7989 } else
7990 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007992 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007993 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007994 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 long value = PyLong_AS_LONG(x);
7996 if (value < 0 || value > 255) {
7997 PyErr_SetString(PyExc_TypeError,
7998 "character mapping must be in range(256)");
7999 Py_DECREF(x);
8000 return NULL;
8001 }
8002 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008004 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 /* wrong return value */
8008 PyErr_Format(PyExc_TypeError,
8009 "character mapping must return integer, bytes or None, not %.400s",
8010 x->ob_type->tp_name);
8011 Py_DECREF(x);
8012 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008013 }
8014}
8015
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008016static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008017charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008018{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008019 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8020 /* exponentially overallocate to minimize reallocations */
8021 if (requiredsize < 2*outsize)
8022 requiredsize = 2*outsize;
8023 if (_PyBytes_Resize(outobj, requiredsize))
8024 return -1;
8025 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008026}
8027
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* an error occurred while looking up or writing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008031/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008032 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008033 space is available. Return a new reference to the object that
8034 was put in the output buffer, or Py_None, if the mapping was undefined
8035 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008036 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008037static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008038charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008039 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008040{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008041 PyObject *rep;
8042 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008043 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008044
Christian Heimes90aa7642007-12-19 02:45:37 +00008045 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008046 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008048 if (res == -1)
8049 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 if (outsize<requiredsize)
8051 if (charmapencode_resize(outobj, outpos, requiredsize))
8052 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008053 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 outstart[(*outpos)++] = (char)res;
8055 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008056 }
8057
8058 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008059 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008061 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 Py_DECREF(rep);
8063 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008064 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008065 if (PyLong_Check(rep)) {
8066 Py_ssize_t requiredsize = *outpos+1;
8067 if (outsize<requiredsize)
8068 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8069 Py_DECREF(rep);
8070 return enc_EXCEPTION;
8071 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008072 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008073 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008074 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 else {
8076 const char *repchars = PyBytes_AS_STRING(rep);
8077 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8078 Py_ssize_t requiredsize = *outpos+repsize;
8079 if (outsize<requiredsize)
8080 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8081 Py_DECREF(rep);
8082 return enc_EXCEPTION;
8083 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008084 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 memcpy(outstart + *outpos, repchars, repsize);
8086 *outpos += repsize;
8087 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008089 Py_DECREF(rep);
8090 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008091}
8092
8093/* handle an error in PyUnicode_EncodeCharmap
8094 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008095static int
8096charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008097 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008098 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008099 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008100 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008101{
8102 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008103 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008104 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008105 enum PyUnicode_Kind kind;
8106 void *data;
8107 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008108 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008109 Py_ssize_t collstartpos = *inpos;
8110 Py_ssize_t collendpos = *inpos+1;
8111 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008112 char *encoding = "charmap";
8113 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008114 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008115 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008116 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008117
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008118 if (PyUnicode_READY(unicode) < 0)
8119 return -1;
8120 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008121 /* find all unencodable characters */
8122 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008123 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008124 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008125 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008126 val = encoding_map_lookup(ch, mapping);
8127 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008128 break;
8129 ++collendpos;
8130 continue;
8131 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008132
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008133 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8134 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 if (rep==NULL)
8136 return -1;
8137 else if (rep!=Py_None) {
8138 Py_DECREF(rep);
8139 break;
8140 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008141 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008142 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008143 }
8144 /* cache callback name lookup
8145 * (if not done yet, i.e. it's the first error) */
8146 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008147 if ((errors==NULL) || (!strcmp(errors, "strict")))
8148 *known_errorHandler = 1;
8149 else if (!strcmp(errors, "replace"))
8150 *known_errorHandler = 2;
8151 else if (!strcmp(errors, "ignore"))
8152 *known_errorHandler = 3;
8153 else if (!strcmp(errors, "xmlcharrefreplace"))
8154 *known_errorHandler = 4;
8155 else
8156 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008157 }
8158 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008159 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008160 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008161 return -1;
8162 case 2: /* replace */
8163 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008164 x = charmapencode_output('?', mapping, res, respos);
8165 if (x==enc_EXCEPTION) {
8166 return -1;
8167 }
8168 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008169 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008170 return -1;
8171 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008172 }
8173 /* fall through */
8174 case 3: /* ignore */
8175 *inpos = collendpos;
8176 break;
8177 case 4: /* xmlcharrefreplace */
8178 /* generate replacement (temporarily (mis)uses p) */
8179 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 char buffer[2+29+1+1];
8181 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008182 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 for (cp = buffer; *cp; ++cp) {
8184 x = charmapencode_output(*cp, mapping, res, respos);
8185 if (x==enc_EXCEPTION)
8186 return -1;
8187 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008188 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 return -1;
8190 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008191 }
8192 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008193 *inpos = collendpos;
8194 break;
8195 default:
8196 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008197 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008198 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008199 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008200 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008201 if (PyBytes_Check(repunicode)) {
8202 /* Directly copy bytes result to output. */
8203 Py_ssize_t outsize = PyBytes_Size(*res);
8204 Py_ssize_t requiredsize;
8205 repsize = PyBytes_Size(repunicode);
8206 requiredsize = *respos + repsize;
8207 if (requiredsize > outsize)
8208 /* Make room for all additional bytes. */
8209 if (charmapencode_resize(res, respos, requiredsize)) {
8210 Py_DECREF(repunicode);
8211 return -1;
8212 }
8213 memcpy(PyBytes_AsString(*res) + *respos,
8214 PyBytes_AsString(repunicode), repsize);
8215 *respos += repsize;
8216 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008217 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008218 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008219 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008220 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008221 if (PyUnicode_READY(repunicode) < 0) {
8222 Py_DECREF(repunicode);
8223 return -1;
8224 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008225 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008226 data = PyUnicode_DATA(repunicode);
8227 kind = PyUnicode_KIND(repunicode);
8228 for (index = 0; index < repsize; index++) {
8229 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8230 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008232 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 return -1;
8234 }
8235 else if (x==enc_FAILED) {
8236 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008237 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 return -1;
8239 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008240 }
8241 *inpos = newpos;
8242 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243 }
8244 return 0;
8245}
8246
Alexander Belopolsky40018472011-02-26 01:02:56 +00008247PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008248_PyUnicode_EncodeCharmap(PyObject *unicode,
8249 PyObject *mapping,
8250 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008251{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008252 /* output object */
8253 PyObject *res = NULL;
8254 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008255 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008256 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008257 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008258 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008259 PyObject *errorHandler = NULL;
8260 PyObject *exc = NULL;
8261 /* the following variable is used for caching string comparisons
8262 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8263 * 3=ignore, 4=xmlcharrefreplace */
8264 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008265
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008266 if (PyUnicode_READY(unicode) < 0)
8267 return NULL;
8268 size = PyUnicode_GET_LENGTH(unicode);
8269
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270 /* Default to Latin-1 */
8271 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008272 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008274 /* allocate enough for a simple encoding without
8275 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008276 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008277 if (res == NULL)
8278 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008279 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008282 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008283 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008284 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008285 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008286 if (x==enc_EXCEPTION) /* error */
8287 goto onError;
8288 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008289 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 &exc,
8291 &known_errorHandler, &errorHandler, errors,
8292 &res, &respos)) {
8293 goto onError;
8294 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008295 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 else
8297 /* done with this character => adjust input position */
8298 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008301 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008302 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008303 if (_PyBytes_Resize(&res, respos) < 0)
8304 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008305
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306 Py_XDECREF(exc);
8307 Py_XDECREF(errorHandler);
8308 return res;
8309
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008311 Py_XDECREF(res);
8312 Py_XDECREF(exc);
8313 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008314 return NULL;
8315}
8316
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008317/* Deprecated */
8318PyObject *
8319PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8320 Py_ssize_t size,
8321 PyObject *mapping,
8322 const char *errors)
8323{
8324 PyObject *result;
8325 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8326 if (unicode == NULL)
8327 return NULL;
8328 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8329 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008330 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008331}
8332
Alexander Belopolsky40018472011-02-26 01:02:56 +00008333PyObject *
8334PyUnicode_AsCharmapString(PyObject *unicode,
8335 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336{
8337 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 PyErr_BadArgument();
8339 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008341 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342}
8343
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008344/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008345static void
8346make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008347 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008348 Py_ssize_t startpos, Py_ssize_t endpos,
8349 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008350{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008351 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008352 *exceptionObject = _PyUnicodeTranslateError_Create(
8353 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008354 }
8355 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8357 goto onError;
8358 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8359 goto onError;
8360 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8361 goto onError;
8362 return;
8363 onError:
8364 Py_DECREF(*exceptionObject);
8365 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366 }
8367}
8368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008370static void
8371raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008373 Py_ssize_t startpos, Py_ssize_t endpos,
8374 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008375{
8376 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008378 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380}
8381
8382/* error handling callback helper:
8383 build arguments, call the callback and check the arguments,
8384 put the result into newpos and return the replacement string, which
8385 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008386static PyObject *
8387unicode_translate_call_errorhandler(const char *errors,
8388 PyObject **errorHandler,
8389 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008390 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008391 Py_ssize_t startpos, Py_ssize_t endpos,
8392 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008393{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008394 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008395
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008396 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397 PyObject *restuple;
8398 PyObject *resunicode;
8399
8400 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008402 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404 }
8405
8406 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008407 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008410
8411 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008416 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 Py_DECREF(restuple);
8418 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419 }
8420 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 &resunicode, &i_newpos)) {
8422 Py_DECREF(restuple);
8423 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008425 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008426 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008427 else
8428 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008429 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8431 Py_DECREF(restuple);
8432 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008433 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434 Py_INCREF(resunicode);
8435 Py_DECREF(restuple);
8436 return resunicode;
8437}
8438
8439/* Lookup the character ch in the mapping and put the result in result,
8440 which must be decrefed by the caller.
8441 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008442static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008443charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444{
Christian Heimes217cfd12007-12-02 14:31:20 +00008445 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008446 PyObject *x;
8447
8448 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008450 x = PyObject_GetItem(mapping, w);
8451 Py_DECREF(w);
8452 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8454 /* No mapping found means: use 1:1 mapping. */
8455 PyErr_Clear();
8456 *result = NULL;
8457 return 0;
8458 } else
8459 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008460 }
8461 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 *result = x;
8463 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008464 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008465 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 long value = PyLong_AS_LONG(x);
8467 long max = PyUnicode_GetMax();
8468 if (value < 0 || value > max) {
8469 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008470 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 Py_DECREF(x);
8472 return -1;
8473 }
8474 *result = x;
8475 return 0;
8476 }
8477 else if (PyUnicode_Check(x)) {
8478 *result = x;
8479 return 0;
8480 }
8481 else {
8482 /* wrong return value */
8483 PyErr_SetString(PyExc_TypeError,
8484 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008485 Py_DECREF(x);
8486 return -1;
8487 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008488}
8489/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 if not reallocate and adjust various state variables.
8491 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008492static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008493charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008495{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008496 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008497 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 /* exponentially overallocate to minimize reallocations */
8499 if (requiredsize < 2 * oldsize)
8500 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8502 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008504 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008505 }
8506 return 0;
8507}
8508/* lookup the character, put the result in the output string and adjust
8509 various state variables. Return a new reference to the object that
8510 was put in the output buffer in *result, or Py_None, if the mapping was
8511 undefined (in which case no character was written).
8512 The caller must decref result.
8513 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008514static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008515charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8516 PyObject *mapping, Py_UCS4 **output,
8517 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008518 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008520 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8521 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008525 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008526 }
8527 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008529 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008532 }
8533 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008534 Py_ssize_t repsize;
8535 if (PyUnicode_READY(*res) == -1)
8536 return -1;
8537 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008538 if (repsize==1) {
8539 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 }
8542 else if (repsize!=0) {
8543 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008544 Py_ssize_t requiredsize = *opos +
8545 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008546 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008547 Py_ssize_t i;
8548 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008550 for(i = 0; i < repsize; i++)
8551 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008553 }
8554 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008556 return 0;
8557}
8558
Alexander Belopolsky40018472011-02-26 01:02:56 +00008559PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008560_PyUnicode_TranslateCharmap(PyObject *input,
8561 PyObject *mapping,
8562 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564 /* input object */
8565 char *idata;
8566 Py_ssize_t size, i;
8567 int kind;
8568 /* output buffer */
8569 Py_UCS4 *output = NULL;
8570 Py_ssize_t osize;
8571 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008572 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008574 char *reason = "character maps to <undefined>";
8575 PyObject *errorHandler = NULL;
8576 PyObject *exc = NULL;
8577 /* the following variable is used for caching string comparisons
8578 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8579 * 3=ignore, 4=xmlcharrefreplace */
8580 int known_errorHandler = -1;
8581
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 PyErr_BadArgument();
8584 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008587 if (PyUnicode_READY(input) == -1)
8588 return NULL;
8589 idata = (char*)PyUnicode_DATA(input);
8590 kind = PyUnicode_KIND(input);
8591 size = PyUnicode_GET_LENGTH(input);
8592 i = 0;
8593
8594 if (size == 0) {
8595 Py_INCREF(input);
8596 return input;
8597 }
8598
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008599 /* allocate enough for a simple 1:1 translation without
8600 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601 osize = size;
8602 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8603 opos = 0;
8604 if (output == NULL) {
8605 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008606 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 /* try to encode it */
8611 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 if (charmaptranslate_output(input, i, mapping,
8613 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 Py_XDECREF(x);
8615 goto onError;
8616 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008617 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008620 else { /* untranslatable character */
8621 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8622 Py_ssize_t repsize;
8623 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 Py_ssize_t collstart = i;
8627 Py_ssize_t collend = i+1;
8628 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631 while (collend < size) {
8632 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008633 goto onError;
8634 Py_XDECREF(x);
8635 if (x!=Py_None)
8636 break;
8637 ++collend;
8638 }
8639 /* cache callback name lookup
8640 * (if not done yet, i.e. it's the first error) */
8641 if (known_errorHandler==-1) {
8642 if ((errors==NULL) || (!strcmp(errors, "strict")))
8643 known_errorHandler = 1;
8644 else if (!strcmp(errors, "replace"))
8645 known_errorHandler = 2;
8646 else if (!strcmp(errors, "ignore"))
8647 known_errorHandler = 3;
8648 else if (!strcmp(errors, "xmlcharrefreplace"))
8649 known_errorHandler = 4;
8650 else
8651 known_errorHandler = 0;
8652 }
8653 switch (known_errorHandler) {
8654 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655 raise_translate_exception(&exc, input, collstart,
8656 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008657 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 case 2: /* replace */
8659 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 for (coll = collstart; coll<collend; coll++)
8661 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 /* fall through */
8663 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 break;
8666 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667 /* generate replacement (temporarily (mis)uses i) */
8668 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 char buffer[2+29+1+1];
8670 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8672 if (charmaptranslate_makespace(&output, &osize,
8673 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 goto onError;
8675 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008678 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 break;
8680 default:
8681 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008682 reason, input, &exc,
8683 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008684 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008686 if (PyUnicode_READY(repunicode) < 0) {
8687 Py_DECREF(repunicode);
8688 goto onError;
8689 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008691 repsize = PyUnicode_GET_LENGTH(repunicode);
8692 if (charmaptranslate_makespace(&output, &osize,
8693 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 Py_DECREF(repunicode);
8695 goto onError;
8696 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 for (uni2 = 0; repsize-->0; ++uni2)
8698 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8699 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008701 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008702 }
8703 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008704 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8705 if (!res)
8706 goto onError;
8707 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008708 Py_XDECREF(exc);
8709 Py_XDECREF(errorHandler);
8710 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008711
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008714 Py_XDECREF(exc);
8715 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 return NULL;
8717}
8718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008719/* Deprecated. Use PyUnicode_Translate instead. */
8720PyObject *
8721PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8722 Py_ssize_t size,
8723 PyObject *mapping,
8724 const char *errors)
8725{
8726 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8727 if (!unicode)
8728 return NULL;
8729 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8730}
8731
Alexander Belopolsky40018472011-02-26 01:02:56 +00008732PyObject *
8733PyUnicode_Translate(PyObject *str,
8734 PyObject *mapping,
8735 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736{
8737 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008738
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739 str = PyUnicode_FromObject(str);
8740 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008741 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008742 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008743 Py_DECREF(str);
8744 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008745
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747 Py_XDECREF(str);
8748 return NULL;
8749}
Tim Petersced69f82003-09-16 20:30:58 +00008750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008752fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008753{
8754 /* No need to call PyUnicode_READY(self) because this function is only
8755 called as a callback from fixup() which does it already. */
8756 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8757 const int kind = PyUnicode_KIND(self);
8758 void *data = PyUnicode_DATA(self);
8759 Py_UCS4 maxchar = 0, ch, fixed;
8760 Py_ssize_t i;
8761
8762 for (i = 0; i < len; ++i) {
8763 ch = PyUnicode_READ(kind, data, i);
8764 fixed = 0;
8765 if (ch > 127) {
8766 if (Py_UNICODE_ISSPACE(ch))
8767 fixed = ' ';
8768 else {
8769 const int decimal = Py_UNICODE_TODECIMAL(ch);
8770 if (decimal >= 0)
8771 fixed = '0' + decimal;
8772 }
8773 if (fixed != 0) {
8774 if (fixed > maxchar)
8775 maxchar = fixed;
8776 PyUnicode_WRITE(kind, data, i, fixed);
8777 }
8778 else if (ch > maxchar)
8779 maxchar = ch;
8780 }
8781 else if (ch > maxchar)
8782 maxchar = ch;
8783 }
8784
8785 return maxchar;
8786}
8787
8788PyObject *
8789_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8790{
8791 if (!PyUnicode_Check(unicode)) {
8792 PyErr_BadInternalCall();
8793 return NULL;
8794 }
8795 if (PyUnicode_READY(unicode) == -1)
8796 return NULL;
8797 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8798 /* If the string is already ASCII, just return the same string */
8799 Py_INCREF(unicode);
8800 return unicode;
8801 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008802 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803}
8804
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008805PyObject *
8806PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8807 Py_ssize_t length)
8808{
Victor Stinnerf0124502011-11-21 23:12:56 +01008809 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008810 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008811 Py_UCS4 maxchar;
8812 enum PyUnicode_Kind kind;
8813 void *data;
8814
8815 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008816 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008817 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008818 if (ch > 127) {
8819 int decimal = Py_UNICODE_TODECIMAL(ch);
8820 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008821 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008822 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008823 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008824 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008825
8826 /* Copy to a new string */
8827 decimal = PyUnicode_New(length, maxchar);
8828 if (decimal == NULL)
8829 return decimal;
8830 kind = PyUnicode_KIND(decimal);
8831 data = PyUnicode_DATA(decimal);
8832 /* Iterate over code points */
8833 for (i = 0; i < length; i++) {
8834 Py_UNICODE ch = s[i];
8835 if (ch > 127) {
8836 int decimal = Py_UNICODE_TODECIMAL(ch);
8837 if (decimal >= 0)
8838 ch = '0' + decimal;
8839 }
8840 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008841 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008842 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008843}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008844/* --- Decimal Encoder ---------------------------------------------------- */
8845
Alexander Belopolsky40018472011-02-26 01:02:56 +00008846int
8847PyUnicode_EncodeDecimal(Py_UNICODE *s,
8848 Py_ssize_t length,
8849 char *output,
8850 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008851{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008852 PyObject *errorHandler = NULL;
8853 PyObject *exc = NULL;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008854 PyObject *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008855 const char *encoding = "decimal";
8856 const char *reason = "invalid decimal Unicode string";
8857 /* the following variable is used for caching string comparisons
8858 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8859 int known_errorHandler = -1;
Victor Stinner42bf7752011-11-21 22:52:58 +01008860 Py_ssize_t i, j;
8861 enum PyUnicode_Kind kind;
8862 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008863
8864 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008865 PyErr_BadArgument();
8866 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008867 }
8868
Victor Stinner42bf7752011-11-21 22:52:58 +01008869 unicode = PyUnicode_FromUnicode(s, length);
8870 if (unicode == NULL)
8871 return -1;
8872
8873 if (PyUnicode_READY(unicode) < 0)
8874 goto onError;
8875 kind = PyUnicode_KIND(unicode);
8876 data = PyUnicode_DATA(unicode);
8877
8878 for (i=0; i < length; i++) {
8879 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 int decimal;
Victor Stinner42bf7752011-11-21 22:52:58 +01008881 Py_ssize_t startpos, endpos;
Tim Petersced69f82003-09-16 20:30:58 +00008882
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008884 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008885 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008886 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008887 decimal = Py_UNICODE_TODECIMAL(ch);
8888 if (decimal >= 0) {
8889 *output++ = '0' + decimal;
Benjamin Peterson29060642009-01-31 22:14:21 +00008890 continue;
8891 }
8892 if (0 < ch && ch < 256) {
8893 *output++ = (char)ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008894 continue;
8895 }
8896 /* All other characters are considered unencodable */
Victor Stinner42bf7752011-11-21 22:52:58 +01008897 startpos = i;
8898 endpos = i+1;
8899 for (; endpos < length; endpos++) {
8900 ch = PyUnicode_READ(kind, data, endpos);
8901 if ((0 < ch && ch < 256) ||
8902 !Py_UNICODE_ISSPACE(ch) ||
8903 Py_UNICODE_TODECIMAL(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00008904 break;
8905 }
8906 /* cache callback name lookup
8907 * (if not done yet, i.e. it's the first error) */
8908 if (known_errorHandler==-1) {
8909 if ((errors==NULL) || (!strcmp(errors, "strict")))
8910 known_errorHandler = 1;
8911 else if (!strcmp(errors, "replace"))
8912 known_errorHandler = 2;
8913 else if (!strcmp(errors, "ignore"))
8914 known_errorHandler = 3;
8915 else if (!strcmp(errors, "xmlcharrefreplace"))
8916 known_errorHandler = 4;
8917 else
8918 known_errorHandler = 0;
8919 }
8920 switch (known_errorHandler) {
8921 case 1: /* strict */
Victor Stinner42bf7752011-11-21 22:52:58 +01008922 raise_encode_exception(&exc, encoding, unicode, startpos, endpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 goto onError;
8924 case 2: /* replace */
Victor Stinner42bf7752011-11-21 22:52:58 +01008925 for (j=startpos; j < endpos; j++)
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 *output++ = '?';
8927 /* fall through */
8928 case 3: /* ignore */
Victor Stinner42bf7752011-11-21 22:52:58 +01008929 i = endpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008930 break;
8931 case 4: /* xmlcharrefreplace */
Victor Stinner42bf7752011-11-21 22:52:58 +01008932 /* generate replacement */
8933 for (j=startpos; j < endpos; j++) {
8934 ch = PyUnicode_READ(kind, data, i);
8935 output += sprintf(output, "&#%d;", (int)ch);
8936 i++;
8937 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 break;
8939 default:
Victor Stinner42bf7752011-11-21 22:52:58 +01008940 {
8941 PyObject *repunicode;
8942 Py_ssize_t repsize, newpos, k;
8943 enum PyUnicode_Kind repkind;
8944 void *repdata;
8945
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008947 encoding, reason, unicode, &exc,
Victor Stinner42bf7752011-11-21 22:52:58 +01008948 startpos, endpos, &newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 if (repunicode == NULL)
8950 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008951 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008952 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008953 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8954 Py_DECREF(repunicode);
8955 goto onError;
8956 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008957 if (PyUnicode_READY(repunicode) < 0) {
8958 Py_DECREF(repunicode);
8959 goto onError;
8960 }
8961 repkind = PyUnicode_KIND(repunicode);
8962 repdata = PyUnicode_DATA(repunicode);
8963
Benjamin Peterson29060642009-01-31 22:14:21 +00008964 /* generate replacement */
8965 repsize = PyUnicode_GET_SIZE(repunicode);
Victor Stinner42bf7752011-11-21 22:52:58 +01008966 for (k=0; k<repsize; k++) {
8967 ch = PyUnicode_READ(repkind, repdata, k);
Benjamin Peterson29060642009-01-31 22:14:21 +00008968 if (Py_UNICODE_ISSPACE(ch))
8969 *output++ = ' ';
8970 else {
8971 decimal = Py_UNICODE_TODECIMAL(ch);
8972 if (decimal >= 0)
8973 *output++ = '0' + decimal;
8974 else if (0 < ch && ch < 256)
8975 *output++ = (char)ch;
8976 else {
8977 Py_DECREF(repunicode);
8978 raise_encode_exception(&exc, encoding,
Victor Stinner42bf7752011-11-21 22:52:58 +01008979 unicode, startpos, endpos,
8980 reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008981 goto onError;
8982 }
8983 }
8984 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008985 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008986 Py_DECREF(repunicode);
8987 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008988 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008989 }
8990 /* 0-terminate the output string */
8991 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008992 Py_XDECREF(exc);
8993 Py_XDECREF(errorHandler);
Victor Stinner42bf7752011-11-21 22:52:58 +01008994 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008995 return 0;
8996
Benjamin Peterson29060642009-01-31 22:14:21 +00008997 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008998 Py_XDECREF(exc);
8999 Py_XDECREF(errorHandler);
Victor Stinner42bf7752011-11-21 22:52:58 +01009000 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009001 return -1;
9002}
9003
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004/* --- Helpers ------------------------------------------------------------ */
9005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009006static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009007any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009008 Py_ssize_t start,
9009 Py_ssize_t end)
9010{
9011 int kind1, kind2, kind;
9012 void *buf1, *buf2;
9013 Py_ssize_t len1, len2, result;
9014
9015 kind1 = PyUnicode_KIND(s1);
9016 kind2 = PyUnicode_KIND(s2);
9017 kind = kind1 > kind2 ? kind1 : kind2;
9018 buf1 = PyUnicode_DATA(s1);
9019 buf2 = PyUnicode_DATA(s2);
9020 if (kind1 != kind)
9021 buf1 = _PyUnicode_AsKind(s1, kind);
9022 if (!buf1)
9023 return -2;
9024 if (kind2 != kind)
9025 buf2 = _PyUnicode_AsKind(s2, kind);
9026 if (!buf2) {
9027 if (kind1 != kind) PyMem_Free(buf1);
9028 return -2;
9029 }
9030 len1 = PyUnicode_GET_LENGTH(s1);
9031 len2 = PyUnicode_GET_LENGTH(s2);
9032
Victor Stinner794d5672011-10-10 03:21:36 +02009033 if (direction > 0) {
9034 switch(kind) {
9035 case PyUnicode_1BYTE_KIND:
9036 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9037 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9038 else
9039 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9040 break;
9041 case PyUnicode_2BYTE_KIND:
9042 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9043 break;
9044 case PyUnicode_4BYTE_KIND:
9045 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9046 break;
9047 default:
9048 assert(0); result = -2;
9049 }
9050 }
9051 else {
9052 switch(kind) {
9053 case PyUnicode_1BYTE_KIND:
9054 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9055 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9056 else
9057 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9058 break;
9059 case PyUnicode_2BYTE_KIND:
9060 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9061 break;
9062 case PyUnicode_4BYTE_KIND:
9063 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9064 break;
9065 default:
9066 assert(0); result = -2;
9067 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 }
9069
9070 if (kind1 != kind)
9071 PyMem_Free(buf1);
9072 if (kind2 != kind)
9073 PyMem_Free(buf2);
9074
9075 return result;
9076}
9077
9078Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009079_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009080 Py_ssize_t n_buffer,
9081 void *digits, Py_ssize_t n_digits,
9082 Py_ssize_t min_width,
9083 const char *grouping,
9084 const char *thousands_sep)
9085{
9086 switch(kind) {
9087 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009088 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9089 return _PyUnicode_ascii_InsertThousandsGrouping(
9090 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9091 min_width, grouping, thousands_sep);
9092 else
9093 return _PyUnicode_ucs1_InsertThousandsGrouping(
9094 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9095 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 case PyUnicode_2BYTE_KIND:
9097 return _PyUnicode_ucs2_InsertThousandsGrouping(
9098 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9099 min_width, grouping, thousands_sep);
9100 case PyUnicode_4BYTE_KIND:
9101 return _PyUnicode_ucs4_InsertThousandsGrouping(
9102 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9103 min_width, grouping, thousands_sep);
9104 }
9105 assert(0);
9106 return -1;
9107}
9108
9109
/* helper macro to fixup start/end slice values: `end` is clipped to `len`,
   negative values of `start` and `end` are taken relative to `len` and
   clamped at 0.  `start` and `end` must be modifiable lvalues; every
   argument may be evaluated more than once.
   Wrapped in do { } while (0) so that it behaves as a single statement
   (usable as the body of an if/else); invoke it with a trailing ';'. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009124
Alexander Belopolsky40018472011-02-26 01:02:56 +00009125Py_ssize_t
9126PyUnicode_Count(PyObject *str,
9127 PyObject *substr,
9128 Py_ssize_t start,
9129 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009131 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009132 PyObject* str_obj;
9133 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009134 int kind1, kind2, kind;
9135 void *buf1 = NULL, *buf2 = NULL;
9136 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009137
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009138 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009140 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009141 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009142 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009143 Py_DECREF(str_obj);
9144 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145 }
Tim Petersced69f82003-09-16 20:30:58 +00009146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009147 kind1 = PyUnicode_KIND(str_obj);
9148 kind2 = PyUnicode_KIND(sub_obj);
9149 kind = kind1 > kind2 ? kind1 : kind2;
9150 buf1 = PyUnicode_DATA(str_obj);
9151 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009152 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153 if (!buf1)
9154 goto onError;
9155 buf2 = PyUnicode_DATA(sub_obj);
9156 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009157 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158 if (!buf2)
9159 goto onError;
9160 len1 = PyUnicode_GET_LENGTH(str_obj);
9161 len2 = PyUnicode_GET_LENGTH(sub_obj);
9162
9163 ADJUST_INDICES(start, end, len1);
9164 switch(kind) {
9165 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009166 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9167 result = asciilib_count(
9168 ((Py_UCS1*)buf1) + start, end - start,
9169 buf2, len2, PY_SSIZE_T_MAX
9170 );
9171 else
9172 result = ucs1lib_count(
9173 ((Py_UCS1*)buf1) + start, end - start,
9174 buf2, len2, PY_SSIZE_T_MAX
9175 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 break;
9177 case PyUnicode_2BYTE_KIND:
9178 result = ucs2lib_count(
9179 ((Py_UCS2*)buf1) + start, end - start,
9180 buf2, len2, PY_SSIZE_T_MAX
9181 );
9182 break;
9183 case PyUnicode_4BYTE_KIND:
9184 result = ucs4lib_count(
9185 ((Py_UCS4*)buf1) + start, end - start,
9186 buf2, len2, PY_SSIZE_T_MAX
9187 );
9188 break;
9189 default:
9190 assert(0); result = 0;
9191 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009192
9193 Py_DECREF(sub_obj);
9194 Py_DECREF(str_obj);
9195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009196 if (kind1 != kind)
9197 PyMem_Free(buf1);
9198 if (kind2 != kind)
9199 PyMem_Free(buf2);
9200
Guido van Rossumd57fd912000-03-10 22:53:23 +00009201 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009202 onError:
9203 Py_DECREF(sub_obj);
9204 Py_DECREF(str_obj);
9205 if (kind1 != kind && buf1)
9206 PyMem_Free(buf1);
9207 if (kind2 != kind && buf2)
9208 PyMem_Free(buf2);
9209 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009210}
9211
Alexander Belopolsky40018472011-02-26 01:02:56 +00009212Py_ssize_t
9213PyUnicode_Find(PyObject *str,
9214 PyObject *sub,
9215 Py_ssize_t start,
9216 Py_ssize_t end,
9217 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009218{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009219 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009220
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009222 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009223 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009224 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009226 Py_DECREF(str);
9227 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228 }
Tim Petersced69f82003-09-16 20:30:58 +00009229
Victor Stinner794d5672011-10-10 03:21:36 +02009230 result = any_find_slice(direction,
9231 str, sub, start, end
9232 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009233
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009235 Py_DECREF(sub);
9236
Guido van Rossumd57fd912000-03-10 22:53:23 +00009237 return result;
9238}
9239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240Py_ssize_t
9241PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9242 Py_ssize_t start, Py_ssize_t end,
9243 int direction)
9244{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009246 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 if (PyUnicode_READY(str) == -1)
9248 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009249 if (start < 0 || end < 0) {
9250 PyErr_SetString(PyExc_IndexError, "string index out of range");
9251 return -2;
9252 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009253 if (end > PyUnicode_GET_LENGTH(str))
9254 end = PyUnicode_GET_LENGTH(str);
9255 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009256 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9257 kind, end-start, ch, direction);
9258 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009259 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009260 else
9261 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009262}
9263
Alexander Belopolsky40018472011-02-26 01:02:56 +00009264static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009265tailmatch(PyObject *self,
9266 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009267 Py_ssize_t start,
9268 Py_ssize_t end,
9269 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 int kind_self;
9272 int kind_sub;
9273 void *data_self;
9274 void *data_sub;
9275 Py_ssize_t offset;
9276 Py_ssize_t i;
9277 Py_ssize_t end_sub;
9278
9279 if (PyUnicode_READY(self) == -1 ||
9280 PyUnicode_READY(substring) == -1)
9281 return 0;
9282
9283 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009284 return 1;
9285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9287 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009288 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009289 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291 kind_self = PyUnicode_KIND(self);
9292 data_self = PyUnicode_DATA(self);
9293 kind_sub = PyUnicode_KIND(substring);
9294 data_sub = PyUnicode_DATA(substring);
9295 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9296
9297 if (direction > 0)
9298 offset = end;
9299 else
9300 offset = start;
9301
9302 if (PyUnicode_READ(kind_self, data_self, offset) ==
9303 PyUnicode_READ(kind_sub, data_sub, 0) &&
9304 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9305 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9306 /* If both are of the same kind, memcmp is sufficient */
9307 if (kind_self == kind_sub) {
9308 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009309 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310 data_sub,
9311 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009312 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 }
9314 /* otherwise we have to compare each character by first accesing it */
9315 else {
9316 /* We do not need to compare 0 and len(substring)-1 because
9317 the if statement above ensured already that they are equal
9318 when we end up here. */
9319 // TODO: honor direction and do a forward or backwards search
9320 for (i = 1; i < end_sub; ++i) {
9321 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9322 PyUnicode_READ(kind_sub, data_sub, i))
9323 return 0;
9324 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009325 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009326 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009327 }
9328
9329 return 0;
9330}
9331
Alexander Belopolsky40018472011-02-26 01:02:56 +00009332Py_ssize_t
9333PyUnicode_Tailmatch(PyObject *str,
9334 PyObject *substr,
9335 Py_ssize_t start,
9336 Py_ssize_t end,
9337 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009338{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009339 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009340
Guido van Rossumd57fd912000-03-10 22:53:23 +00009341 str = PyUnicode_FromObject(str);
9342 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009343 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009344 substr = PyUnicode_FromObject(substr);
9345 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009346 Py_DECREF(str);
9347 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009348 }
Tim Petersced69f82003-09-16 20:30:58 +00009349
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009350 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009351 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009352 Py_DECREF(str);
9353 Py_DECREF(substr);
9354 return result;
9355}
9356
Guido van Rossumd57fd912000-03-10 22:53:23 +00009357/* Apply fixfct filter to the Unicode object self and return a
9358 reference to the modified object */
9359
Alexander Belopolsky40018472011-02-26 01:02:56 +00009360static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009361fixup(PyObject *self,
9362 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009364 PyObject *u;
9365 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009366
Victor Stinner87af4f22011-11-21 23:03:47 +01009367 u = PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009369 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009370 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009371
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372 /* fix functions return the new maximum character in a string,
9373 if the kind of the resulting unicode object does not change,
9374 everything is fine. Otherwise we need to change the string kind
9375 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009376 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 if (maxchar_new == 0)
9378 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9379 else if (maxchar_new <= 127)
9380 maxchar_new = 127;
9381 else if (maxchar_new <= 255)
9382 maxchar_new = 255;
9383 else if (maxchar_new <= 65535)
9384 maxchar_new = 65535;
9385 else
9386 maxchar_new = 1114111; /* 0x10ffff */
9387
9388 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009389 /* fixfct should return TRUE if it modified the buffer. If
9390 FALSE, return a reference to the original buffer instead
9391 (to save space, not time) */
9392 Py_INCREF(self);
9393 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009394 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009395 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009396 else if (maxchar_new == maxchar_old) {
9397 return u;
9398 }
9399 else {
9400 /* In case the maximum character changed, we need to
9401 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009402 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009403 if (v == NULL) {
9404 Py_DECREF(u);
9405 return NULL;
9406 }
9407 if (maxchar_new > maxchar_old) {
9408 /* If the maxchar increased so that the kind changed, not all
9409 characters are representable anymore and we need to fix the
9410 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009411 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009412 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9414 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009415 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009416 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009417 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418
9419 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009420 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009421 return v;
9422 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009423}
9424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009426fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009427{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009428 /* No need to call PyUnicode_READY(self) because this function is only
9429 called as a callback from fixup() which does it already. */
9430 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9431 const int kind = PyUnicode_KIND(self);
9432 void *data = PyUnicode_DATA(self);
9433 int touched = 0;
9434 Py_UCS4 maxchar = 0;
9435 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 for (i = 0; i < len; ++i) {
9438 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9439 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9440 if (up != ch) {
9441 if (up > maxchar)
9442 maxchar = up;
9443 PyUnicode_WRITE(kind, data, i, up);
9444 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009446 else if (ch > maxchar)
9447 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448 }
9449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009450 if (touched)
9451 return maxchar;
9452 else
9453 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009454}
9455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009457fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009458{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9460 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9461 const int kind = PyUnicode_KIND(self);
9462 void *data = PyUnicode_DATA(self);
9463 int touched = 0;
9464 Py_UCS4 maxchar = 0;
9465 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009467 for(i = 0; i < len; ++i) {
9468 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9469 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9470 if (lo != ch) {
9471 if (lo > maxchar)
9472 maxchar = lo;
9473 PyUnicode_WRITE(kind, data, i, lo);
9474 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009475 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476 else if (ch > maxchar)
9477 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478 }
9479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009480 if (touched)
9481 return maxchar;
9482 else
9483 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009484}
9485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009487fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009488{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9490 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9491 const int kind = PyUnicode_KIND(self);
9492 void *data = PyUnicode_DATA(self);
9493 int touched = 0;
9494 Py_UCS4 maxchar = 0;
9495 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497 for(i = 0; i < len; ++i) {
9498 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9499 Py_UCS4 nu = 0;
9500
9501 if (Py_UNICODE_ISUPPER(ch))
9502 nu = Py_UNICODE_TOLOWER(ch);
9503 else if (Py_UNICODE_ISLOWER(ch))
9504 nu = Py_UNICODE_TOUPPER(ch);
9505
9506 if (nu != 0) {
9507 if (nu > maxchar)
9508 maxchar = nu;
9509 PyUnicode_WRITE(kind, data, i, nu);
9510 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009512 else if (ch > maxchar)
9513 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009514 }
9515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 if (touched)
9517 return maxchar;
9518 else
9519 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009520}
9521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009522static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009523fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009524{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009525 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9526 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9527 const int kind = PyUnicode_KIND(self);
9528 void *data = PyUnicode_DATA(self);
9529 int touched = 0;
9530 Py_UCS4 maxchar = 0;
9531 Py_ssize_t i = 0;
9532 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009533
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009534 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009535 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536
9537 ch = PyUnicode_READ(kind, data, i);
9538 if (!Py_UNICODE_ISUPPER(ch)) {
9539 maxchar = Py_UNICODE_TOUPPER(ch);
9540 PyUnicode_WRITE(kind, data, i, maxchar);
9541 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 ++i;
9544 for(; i < len; ++i) {
9545 ch = PyUnicode_READ(kind, data, i);
9546 if (!Py_UNICODE_ISLOWER(ch)) {
9547 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9548 if (lo > maxchar)
9549 maxchar = lo;
9550 PyUnicode_WRITE(kind, data, i, lo);
9551 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009552 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009553 else if (ch > maxchar)
9554 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009555 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009556
9557 if (touched)
9558 return maxchar;
9559 else
9560 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009561}
9562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009564fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009565{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9567 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9568 const int kind = PyUnicode_KIND(self);
9569 void *data = PyUnicode_DATA(self);
9570 Py_UCS4 maxchar = 0;
9571 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009572 int previous_is_cased;
9573
9574 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009575 if (len == 1) {
9576 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9577 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9578 if (ti != ch) {
9579 PyUnicode_WRITE(kind, data, i, ti);
9580 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009581 }
9582 else
9583 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009586 for(; i < len; ++i) {
9587 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9588 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009589
Benjamin Peterson29060642009-01-31 22:14:21 +00009590 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009592 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 nu = Py_UNICODE_TOTITLE(ch);
9594
9595 if (nu > maxchar)
9596 maxchar = nu;
9597 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009598
Benjamin Peterson29060642009-01-31 22:14:21 +00009599 if (Py_UNICODE_ISLOWER(ch) ||
9600 Py_UNICODE_ISUPPER(ch) ||
9601 Py_UNICODE_ISTITLE(ch))
9602 previous_is_cased = 1;
9603 else
9604 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009607}
9608
Tim Peters8ce9f162004-08-27 01:49:32 +00009609PyObject *
9610PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009613 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009614 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009615 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009616 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9617 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009618 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009619 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009620 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009621 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009622 int use_memcpy;
9623 unsigned char *res_data = NULL, *sep_data = NULL;
9624 PyObject *last_obj;
9625 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626
Tim Peters05eba1f2004-08-27 21:32:02 +00009627 fseq = PySequence_Fast(seq, "");
9628 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009629 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009630 }
9631
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009632 /* NOTE: the following code can't call back into Python code,
9633 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009634 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009635
Tim Peters05eba1f2004-08-27 21:32:02 +00009636 seqlen = PySequence_Fast_GET_SIZE(fseq);
9637 /* If empty sequence, return u"". */
9638 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009639 Py_DECREF(fseq);
9640 Py_INCREF(unicode_empty);
9641 res = unicode_empty;
9642 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009643 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009644
Tim Peters05eba1f2004-08-27 21:32:02 +00009645 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009646 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009647 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009648 if (seqlen == 1) {
9649 if (PyUnicode_CheckExact(items[0])) {
9650 res = items[0];
9651 Py_INCREF(res);
9652 Py_DECREF(fseq);
9653 return res;
9654 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009655 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009656 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009657 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009658 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009659 /* Set up sep and seplen */
9660 if (separator == NULL) {
9661 /* fall back to a blank space separator */
9662 sep = PyUnicode_FromOrdinal(' ');
9663 if (!sep)
9664 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009665 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009666 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009667 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009668 else {
9669 if (!PyUnicode_Check(separator)) {
9670 PyErr_Format(PyExc_TypeError,
9671 "separator: expected str instance,"
9672 " %.80s found",
9673 Py_TYPE(separator)->tp_name);
9674 goto onError;
9675 }
9676 if (PyUnicode_READY(separator))
9677 goto onError;
9678 sep = separator;
9679 seplen = PyUnicode_GET_LENGTH(separator);
9680 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9681 /* inc refcount to keep this code path symmetric with the
9682 above case of a blank separator */
9683 Py_INCREF(sep);
9684 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009685 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009686 }
9687
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009688 /* There are at least two things to join, or else we have a subclass
9689 * of str in the sequence.
9690 * Do a pre-pass to figure out the total amount of space we'll
9691 * need (sz), and see whether all argument are strings.
9692 */
9693 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009694#ifdef Py_DEBUG
9695 use_memcpy = 0;
9696#else
9697 use_memcpy = 1;
9698#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009699 for (i = 0; i < seqlen; i++) {
9700 const Py_ssize_t old_sz = sz;
9701 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009702 if (!PyUnicode_Check(item)) {
9703 PyErr_Format(PyExc_TypeError,
9704 "sequence item %zd: expected str instance,"
9705 " %.80s found",
9706 i, Py_TYPE(item)->tp_name);
9707 goto onError;
9708 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009709 if (PyUnicode_READY(item) == -1)
9710 goto onError;
9711 sz += PyUnicode_GET_LENGTH(item);
9712 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009713 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009714 if (i != 0)
9715 sz += seplen;
9716 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9717 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009718 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009719 goto onError;
9720 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009721 if (use_memcpy && last_obj != NULL) {
9722 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9723 use_memcpy = 0;
9724 }
9725 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009726 }
Tim Petersced69f82003-09-16 20:30:58 +00009727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009728 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009729 if (res == NULL)
9730 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009731
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009732 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009733#ifdef Py_DEBUG
9734 use_memcpy = 0;
9735#else
9736 if (use_memcpy) {
9737 res_data = PyUnicode_1BYTE_DATA(res);
9738 kind = PyUnicode_KIND(res);
9739 if (seplen != 0)
9740 sep_data = PyUnicode_1BYTE_DATA(sep);
9741 }
9742#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009743 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009744 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009745 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009746 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009747 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009748 if (use_memcpy) {
9749 Py_MEMCPY(res_data,
9750 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009751 kind * seplen);
9752 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009753 }
9754 else {
9755 copy_characters(res, res_offset, sep, 0, seplen);
9756 res_offset += seplen;
9757 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009758 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009759 itemlen = PyUnicode_GET_LENGTH(item);
9760 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009761 if (use_memcpy) {
9762 Py_MEMCPY(res_data,
9763 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009764 kind * itemlen);
9765 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009766 }
9767 else {
9768 copy_characters(res, res_offset, item, 0, itemlen);
9769 res_offset += itemlen;
9770 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009771 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009772 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009773 if (use_memcpy)
9774 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009775 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009776 else
9777 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009778
Tim Peters05eba1f2004-08-27 21:32:02 +00009779 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009781 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009783
Benjamin Peterson29060642009-01-31 22:14:21 +00009784 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009785 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009786 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009787 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009788 return NULL;
9789}
9790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791#define FILL(kind, data, value, start, length) \
9792 do { \
9793 Py_ssize_t i_ = 0; \
9794 assert(kind != PyUnicode_WCHAR_KIND); \
9795 switch ((kind)) { \
9796 case PyUnicode_1BYTE_KIND: { \
9797 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9798 memset(to_, (unsigned char)value, length); \
9799 break; \
9800 } \
9801 case PyUnicode_2BYTE_KIND: { \
9802 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9803 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9804 break; \
9805 } \
9806 default: { \
9807 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9808 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9809 break; \
9810 } \
9811 } \
9812 } while (0)
9813
Victor Stinner9310abb2011-10-05 00:59:23 +02009814static PyObject *
9815pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009816 Py_ssize_t left,
9817 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009818 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009820 PyObject *u;
9821 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009822 int kind;
9823 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009824
9825 if (left < 0)
9826 left = 0;
9827 if (right < 0)
9828 right = 0;
9829
Tim Peters7a29bd52001-09-12 03:03:31 +00009830 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009831 Py_INCREF(self);
9832 return self;
9833 }
9834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009835 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9836 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009837 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9838 return NULL;
9839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009840 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9841 if (fill > maxchar)
9842 maxchar = fill;
9843 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009844 if (!u)
9845 return NULL;
9846
9847 kind = PyUnicode_KIND(u);
9848 data = PyUnicode_DATA(u);
9849 if (left)
9850 FILL(kind, data, fill, 0, left);
9851 if (right)
9852 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009853 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009854 assert(_PyUnicode_CheckConsistency(u, 1));
9855 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009856}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009858
Alexander Belopolsky40018472011-02-26 01:02:56 +00009859PyObject *
9860PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009862 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009863
9864 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009866 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868 switch(PyUnicode_KIND(string)) {
9869 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009870 if (PyUnicode_IS_ASCII(string))
9871 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009872 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009873 PyUnicode_GET_LENGTH(string), keepends);
9874 else
9875 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009876 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009877 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009878 break;
9879 case PyUnicode_2BYTE_KIND:
9880 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009881 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009882 PyUnicode_GET_LENGTH(string), keepends);
9883 break;
9884 case PyUnicode_4BYTE_KIND:
9885 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009886 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009887 PyUnicode_GET_LENGTH(string), keepends);
9888 break;
9889 default:
9890 assert(0);
9891 list = 0;
9892 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893 Py_DECREF(string);
9894 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895}
9896
Alexander Belopolsky40018472011-02-26 01:02:56 +00009897static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009898split(PyObject *self,
9899 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009900 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009901{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902 int kind1, kind2, kind;
9903 void *buf1, *buf2;
9904 Py_ssize_t len1, len2;
9905 PyObject* out;
9906
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009908 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 if (PyUnicode_READY(self) == -1)
9911 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 if (substring == NULL)
9914 switch(PyUnicode_KIND(self)) {
9915 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009916 if (PyUnicode_IS_ASCII(self))
9917 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009918 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009919 PyUnicode_GET_LENGTH(self), maxcount
9920 );
9921 else
9922 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009923 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009924 PyUnicode_GET_LENGTH(self), maxcount
9925 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926 case PyUnicode_2BYTE_KIND:
9927 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009928 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009929 PyUnicode_GET_LENGTH(self), maxcount
9930 );
9931 case PyUnicode_4BYTE_KIND:
9932 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009933 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 PyUnicode_GET_LENGTH(self), maxcount
9935 );
9936 default:
9937 assert(0);
9938 return NULL;
9939 }
9940
9941 if (PyUnicode_READY(substring) == -1)
9942 return NULL;
9943
9944 kind1 = PyUnicode_KIND(self);
9945 kind2 = PyUnicode_KIND(substring);
9946 kind = kind1 > kind2 ? kind1 : kind2;
9947 buf1 = PyUnicode_DATA(self);
9948 buf2 = PyUnicode_DATA(substring);
9949 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009950 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009951 if (!buf1)
9952 return NULL;
9953 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009954 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009955 if (!buf2) {
9956 if (kind1 != kind) PyMem_Free(buf1);
9957 return NULL;
9958 }
9959 len1 = PyUnicode_GET_LENGTH(self);
9960 len2 = PyUnicode_GET_LENGTH(substring);
9961
9962 switch(kind) {
9963 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009964 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9965 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009966 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009967 else
9968 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009969 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 break;
9971 case PyUnicode_2BYTE_KIND:
9972 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009973 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009974 break;
9975 case PyUnicode_4BYTE_KIND:
9976 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009977 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978 break;
9979 default:
9980 out = NULL;
9981 }
9982 if (kind1 != kind)
9983 PyMem_Free(buf1);
9984 if (kind2 != kind)
9985 PyMem_Free(buf2);
9986 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009987}
9988
Alexander Belopolsky40018472011-02-26 01:02:56 +00009989static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009990rsplit(PyObject *self,
9991 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009992 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009993{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009994 int kind1, kind2, kind;
9995 void *buf1, *buf2;
9996 Py_ssize_t len1, len2;
9997 PyObject* out;
9998
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009999 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010000 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010002 if (PyUnicode_READY(self) == -1)
10003 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005 if (substring == NULL)
10006 switch(PyUnicode_KIND(self)) {
10007 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010008 if (PyUnicode_IS_ASCII(self))
10009 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010010 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010011 PyUnicode_GET_LENGTH(self), maxcount
10012 );
10013 else
10014 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010015 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010016 PyUnicode_GET_LENGTH(self), maxcount
10017 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 case PyUnicode_2BYTE_KIND:
10019 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010020 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 PyUnicode_GET_LENGTH(self), maxcount
10022 );
10023 case PyUnicode_4BYTE_KIND:
10024 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010025 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 PyUnicode_GET_LENGTH(self), maxcount
10027 );
10028 default:
10029 assert(0);
10030 return NULL;
10031 }
10032
10033 if (PyUnicode_READY(substring) == -1)
10034 return NULL;
10035
10036 kind1 = PyUnicode_KIND(self);
10037 kind2 = PyUnicode_KIND(substring);
10038 kind = kind1 > kind2 ? kind1 : kind2;
10039 buf1 = PyUnicode_DATA(self);
10040 buf2 = PyUnicode_DATA(substring);
10041 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010042 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 if (!buf1)
10044 return NULL;
10045 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010046 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 if (!buf2) {
10048 if (kind1 != kind) PyMem_Free(buf1);
10049 return NULL;
10050 }
10051 len1 = PyUnicode_GET_LENGTH(self);
10052 len2 = PyUnicode_GET_LENGTH(substring);
10053
10054 switch(kind) {
10055 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010056 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10057 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010058 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010059 else
10060 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010061 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062 break;
10063 case PyUnicode_2BYTE_KIND:
10064 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010065 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 break;
10067 case PyUnicode_4BYTE_KIND:
10068 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010069 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 break;
10071 default:
10072 out = NULL;
10073 }
10074 if (kind1 != kind)
10075 PyMem_Free(buf1);
10076 if (kind2 != kind)
10077 PyMem_Free(buf2);
10078 return out;
10079}
10080
10081static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010082anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10083 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084{
10085 switch(kind) {
10086 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010087 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10088 return asciilib_find(buf1, len1, buf2, len2, offset);
10089 else
10090 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 case PyUnicode_2BYTE_KIND:
10092 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10093 case PyUnicode_4BYTE_KIND:
10094 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10095 }
10096 assert(0);
10097 return -1;
10098}
10099
10100static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010101anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10102 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103{
10104 switch(kind) {
10105 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010106 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10107 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10108 else
10109 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 case PyUnicode_2BYTE_KIND:
10111 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10112 case PyUnicode_4BYTE_KIND:
10113 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10114 }
10115 assert(0);
10116 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010117}
10118
Alexander Belopolsky40018472011-02-26 01:02:56 +000010119static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120replace(PyObject *self, PyObject *str1,
10121 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010122{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 PyObject *u;
10124 char *sbuf = PyUnicode_DATA(self);
10125 char *buf1 = PyUnicode_DATA(str1);
10126 char *buf2 = PyUnicode_DATA(str2);
10127 int srelease = 0, release1 = 0, release2 = 0;
10128 int skind = PyUnicode_KIND(self);
10129 int kind1 = PyUnicode_KIND(str1);
10130 int kind2 = PyUnicode_KIND(str2);
10131 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10132 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10133 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010134 int mayshrink;
10135 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136
10137 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010138 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010139 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010140 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141
Victor Stinner59de0ee2011-10-07 10:01:28 +020010142 if (str1 == str2)
10143 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 if (skind < kind1)
10145 /* substring too wide to be present */
10146 goto nothing;
10147
Victor Stinner49a0a212011-10-12 23:46:10 +020010148 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10149 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10150 /* Replacing str1 with str2 may cause a maxchar reduction in the
10151 result string. */
10152 mayshrink = (maxchar_str2 < maxchar);
10153 maxchar = Py_MAX(maxchar, maxchar_str2);
10154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010156 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010157 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010159 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010161 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010162 Py_UCS4 u1, u2;
10163 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010165 if (findchar(sbuf, PyUnicode_KIND(self),
10166 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010167 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010170 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010172 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 rkind = PyUnicode_KIND(u);
10174 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10175 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010176 if (--maxcount < 0)
10177 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010179 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010180 }
10181 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 int rkind = skind;
10183 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 if (kind1 < rkind) {
10186 /* widen substring */
10187 buf1 = _PyUnicode_AsKind(str1, rkind);
10188 if (!buf1) goto error;
10189 release1 = 1;
10190 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010191 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010192 if (i < 0)
10193 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 if (rkind > kind2) {
10195 /* widen replacement */
10196 buf2 = _PyUnicode_AsKind(str2, rkind);
10197 if (!buf2) goto error;
10198 release2 = 1;
10199 }
10200 else if (rkind < kind2) {
10201 /* widen self and buf1 */
10202 rkind = kind2;
10203 if (release1) PyMem_Free(buf1);
10204 sbuf = _PyUnicode_AsKind(self, rkind);
10205 if (!sbuf) goto error;
10206 srelease = 1;
10207 buf1 = _PyUnicode_AsKind(str1, rkind);
10208 if (!buf1) goto error;
10209 release1 = 1;
10210 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010211 u = PyUnicode_New(slen, maxchar);
10212 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010214 assert(PyUnicode_KIND(u) == rkind);
10215 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010216
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010217 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010218 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010219 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010221 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010223
10224 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010225 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010226 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010227 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010228 if (i == -1)
10229 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010230 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010232 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010236 }
10237 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 Py_ssize_t n, i, j, ires;
10239 Py_ssize_t product, new_size;
10240 int rkind = skind;
10241 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010244 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 buf1 = _PyUnicode_AsKind(str1, rkind);
10246 if (!buf1) goto error;
10247 release1 = 1;
10248 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010249 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010250 if (n == 0)
10251 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010253 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 buf2 = _PyUnicode_AsKind(str2, rkind);
10255 if (!buf2) goto error;
10256 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010257 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010259 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 rkind = kind2;
10261 sbuf = _PyUnicode_AsKind(self, rkind);
10262 if (!sbuf) goto error;
10263 srelease = 1;
10264 if (release1) PyMem_Free(buf1);
10265 buf1 = _PyUnicode_AsKind(str1, rkind);
10266 if (!buf1) goto error;
10267 release1 = 1;
10268 }
10269 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10270 PyUnicode_GET_LENGTH(str1))); */
10271 product = n * (len2-len1);
10272 if ((product / (len2-len1)) != n) {
10273 PyErr_SetString(PyExc_OverflowError,
10274 "replace string is too long");
10275 goto error;
10276 }
10277 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010278 if (new_size == 0) {
10279 Py_INCREF(unicode_empty);
10280 u = unicode_empty;
10281 goto done;
10282 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10284 PyErr_SetString(PyExc_OverflowError,
10285 "replace string is too long");
10286 goto error;
10287 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010288 u = PyUnicode_New(new_size, maxchar);
10289 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010291 assert(PyUnicode_KIND(u) == rkind);
10292 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010293 ires = i = 0;
10294 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010295 while (n-- > 0) {
10296 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010297 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010298 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010299 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010300 if (j == -1)
10301 break;
10302 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010303 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010304 memcpy(res + rkind * ires,
10305 sbuf + rkind * i,
10306 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010307 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010308 }
10309 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010311 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010313 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010317 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010319 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010320 memcpy(res + rkind * ires,
10321 sbuf + rkind * i,
10322 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010323 }
10324 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010325 /* interleave */
10326 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010327 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010329 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010331 if (--n <= 0)
10332 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010333 memcpy(res + rkind * ires,
10334 sbuf + rkind * i,
10335 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 ires++;
10337 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010338 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010339 memcpy(res + rkind * ires,
10340 sbuf + rkind * i,
10341 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010342 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010343 }
10344
10345 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010346 unicode_adjust_maxchar(&u);
10347 if (u == NULL)
10348 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010349 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010350
10351 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 if (srelease)
10353 PyMem_FREE(sbuf);
10354 if (release1)
10355 PyMem_FREE(buf1);
10356 if (release2)
10357 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010358 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010360
Benjamin Peterson29060642009-01-31 22:14:21 +000010361 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010362 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 if (srelease)
10364 PyMem_FREE(sbuf);
10365 if (release1)
10366 PyMem_FREE(buf1);
10367 if (release2)
10368 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010369 if (PyUnicode_CheckExact(self)) {
10370 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010371 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010372 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010373 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 error:
10375 if (srelease && sbuf)
10376 PyMem_FREE(sbuf);
10377 if (release1 && buf1)
10378 PyMem_FREE(buf1);
10379 if (release2 && buf2)
10380 PyMem_FREE(buf2);
10381 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382}
10383
10384/* --- Unicode Object Methods --------------------------------------------- */
10385
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010386PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010387 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388\n\
10389Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010390characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391
10392static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010393unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010395 return fixup(self, fixtitle);
10396}
10397
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010398PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010399 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010400\n\
10401Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010402have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403
10404static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010405unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010407 return fixup(self, fixcapitalize);
10408}
10409
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace, run every
   word through fixup(..., fixcapitalize), then join the words again.
   Returns NULL if splitting or capitalizing fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup(old_word, fixcapitalize);

        if (new_word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10447
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010448/* Argument converter. Coerces to a single unicode character */
10449
10450static int
10451convert_uc(PyObject *obj, void *addr)
10452{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010454 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010455
Benjamin Peterson14339b62009-01-31 16:36:08 +000010456 uniobj = PyUnicode_FromObject(obj);
10457 if (uniobj == NULL) {
10458 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010459 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010460 return 0;
10461 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010463 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010464 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010465 Py_DECREF(uniobj);
10466 return 0;
10467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010469 Py_DECREF(uniobj);
10470 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010471}
10472
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010473PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010474 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010475\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010476Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010477done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010478
10479static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010480unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010481{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010482 Py_ssize_t marg, left;
10483 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 Py_UCS4 fillchar = ' ';
10485
Victor Stinnere9a29352011-10-01 02:14:59 +020010486 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010488
Victor Stinnere9a29352011-10-01 02:14:59 +020010489 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490 return NULL;
10491
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010494 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495 }
10496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010498 left = marg / 2 + (marg & width & 1);
10499
Victor Stinner9310abb2011-10-05 00:59:23 +020010500 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010501}
10502
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503/* This function assumes that str1 and str2 are readied by the caller. */
10504
Marc-André Lemburge5034372000-08-08 08:04:29 +000010505static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010506unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010507{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 int kind1, kind2;
10509 void *data1, *data2;
10510 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 kind1 = PyUnicode_KIND(str1);
10513 kind2 = PyUnicode_KIND(str2);
10514 data1 = PyUnicode_DATA(str1);
10515 data2 = PyUnicode_DATA(str2);
10516 len1 = PyUnicode_GET_LENGTH(str1);
10517 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 for (i = 0; i < len1 && i < len2; ++i) {
10520 Py_UCS4 c1, c2;
10521 c1 = PyUnicode_READ(kind1, data1, i);
10522 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010523
10524 if (c1 != c2)
10525 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010526 }
10527
10528 return (len1 < len2) ? -1 : (len1 != len2);
10529}
10530
Alexander Belopolsky40018472011-02-26 01:02:56 +000010531int
10532PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10535 if (PyUnicode_READY(left) == -1 ||
10536 PyUnicode_READY(right) == -1)
10537 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010538 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010540 PyErr_Format(PyExc_TypeError,
10541 "Can't compare %.100s and %.100s",
10542 left->ob_type->tp_name,
10543 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010544 return -1;
10545}
10546
Martin v. Löwis5b222132007-06-10 09:51:05 +000010547int
10548PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10549{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 Py_ssize_t i;
10551 int kind;
10552 void *data;
10553 Py_UCS4 chr;
10554
Victor Stinner910337b2011-10-03 03:20:16 +020010555 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 if (PyUnicode_READY(uni) == -1)
10557 return -1;
10558 kind = PyUnicode_KIND(uni);
10559 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010560 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10562 if (chr != str[i])
10563 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010564 /* This check keeps Python strings that end in '\0' from comparing equal
10565 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010567 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010568 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010569 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010570 return 0;
10571}
10572
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010573
Benjamin Peterson29060642009-01-31 22:14:21 +000010574#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010575 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010576
Alexander Belopolsky40018472011-02-26 01:02:56 +000010577PyObject *
10578PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010579{
10580 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010581
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010582 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10583 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 if (PyUnicode_READY(left) == -1 ||
10585 PyUnicode_READY(right) == -1)
10586 return NULL;
10587 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10588 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010589 if (op == Py_EQ) {
10590 Py_INCREF(Py_False);
10591 return Py_False;
10592 }
10593 if (op == Py_NE) {
10594 Py_INCREF(Py_True);
10595 return Py_True;
10596 }
10597 }
10598 if (left == right)
10599 result = 0;
10600 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010601 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010602
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010603 /* Convert the return value to a Boolean */
10604 switch (op) {
10605 case Py_EQ:
10606 v = TEST_COND(result == 0);
10607 break;
10608 case Py_NE:
10609 v = TEST_COND(result != 0);
10610 break;
10611 case Py_LE:
10612 v = TEST_COND(result <= 0);
10613 break;
10614 case Py_GE:
10615 v = TEST_COND(result >= 0);
10616 break;
10617 case Py_LT:
10618 v = TEST_COND(result == -1);
10619 break;
10620 case Py_GT:
10621 v = TEST_COND(result == 1);
10622 break;
10623 default:
10624 PyErr_BadArgument();
10625 return NULL;
10626 }
10627 Py_INCREF(v);
10628 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010629 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010630
Brian Curtindfc80e32011-08-10 20:28:54 -050010631 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010632}
10633
Alexander Belopolsky40018472011-02-26 01:02:56 +000010634int
10635PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010636{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010637 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 int kind1, kind2, kind;
10639 void *buf1, *buf2;
10640 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010641 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010642
10643 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010644 sub = PyUnicode_FromObject(element);
10645 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010646 PyErr_Format(PyExc_TypeError,
10647 "'in <string>' requires string as left operand, not %s",
10648 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010649 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010650 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 if (PyUnicode_READY(sub) == -1)
10652 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010653
Thomas Wouters477c8d52006-05-27 19:21:47 +000010654 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010655 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010656 Py_DECREF(sub);
10657 return -1;
10658 }
10659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 kind1 = PyUnicode_KIND(str);
10661 kind2 = PyUnicode_KIND(sub);
10662 kind = kind1 > kind2 ? kind1 : kind2;
10663 buf1 = PyUnicode_DATA(str);
10664 buf2 = PyUnicode_DATA(sub);
10665 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010666 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 if (!buf1) {
10668 Py_DECREF(sub);
10669 return -1;
10670 }
10671 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010672 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 if (!buf2) {
10674 Py_DECREF(sub);
10675 if (kind1 != kind) PyMem_Free(buf1);
10676 return -1;
10677 }
10678 len1 = PyUnicode_GET_LENGTH(str);
10679 len2 = PyUnicode_GET_LENGTH(sub);
10680
10681 switch(kind) {
10682 case PyUnicode_1BYTE_KIND:
10683 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10684 break;
10685 case PyUnicode_2BYTE_KIND:
10686 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10687 break;
10688 case PyUnicode_4BYTE_KIND:
10689 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10690 break;
10691 default:
10692 result = -1;
10693 assert(0);
10694 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010695
10696 Py_DECREF(str);
10697 Py_DECREF(sub);
10698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 if (kind1 != kind)
10700 PyMem_Free(buf1);
10701 if (kind2 != kind)
10702 PyMem_Free(buf2);
10703
Guido van Rossum403d68b2000-03-13 15:55:09 +000010704 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010705}
10706
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707/* Concat to string or Unicode object giving a new Unicode object. */
10708
Alexander Belopolsky40018472011-02-26 01:02:56 +000010709PyObject *
10710PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010713 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714
10715 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010718 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010721 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722
10723 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010724 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010728 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010729 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731 }
10732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010734 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10735 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 w = PyUnicode_New(
10739 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10740 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010742 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010743 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10744 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745 Py_DECREF(u);
10746 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010747 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749
Benjamin Peterson29060642009-01-31 22:14:21 +000010750 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751 Py_XDECREF(u);
10752 Py_XDECREF(v);
10753 return NULL;
10754}
10755
Victor Stinnerb0923652011-10-04 01:17:31 +020010756static void
10757unicode_append_inplace(PyObject **p_left, PyObject *right)
10758{
10759 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010760
10761 assert(PyUnicode_IS_READY(*p_left));
10762 assert(PyUnicode_IS_READY(right));
10763
10764 left_len = PyUnicode_GET_LENGTH(*p_left);
10765 right_len = PyUnicode_GET_LENGTH(right);
10766 if (left_len > PY_SSIZE_T_MAX - right_len) {
10767 PyErr_SetString(PyExc_OverflowError,
10768 "strings are too large to concat");
10769 goto error;
10770 }
10771 new_len = left_len + right_len;
10772
10773 /* Now we own the last reference to 'left', so we can resize it
10774 * in-place.
10775 */
10776 if (unicode_resize(p_left, new_len) != 0) {
10777 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10778 * deallocated so it cannot be put back into
10779 * 'variable'. The MemoryError is raised when there
10780 * is no value in 'variable', which might (very
10781 * remotely) be a cause of incompatibilities.
10782 */
10783 goto error;
10784 }
10785 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010786 copy_characters(*p_left, left_len, right, 0, right_len);
10787 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010788 return;
10789
10790error:
10791 Py_DECREF(*p_left);
10792 *p_left = NULL;
10793}
10794
Walter Dörwald1ab83302007-05-18 17:15:44 +000010795void
Victor Stinner23e56682011-10-03 03:54:37 +020010796PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010797{
Victor Stinner23e56682011-10-03 03:54:37 +020010798 PyObject *left, *res;
10799
10800 if (p_left == NULL) {
10801 if (!PyErr_Occurred())
10802 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010803 return;
10804 }
Victor Stinner23e56682011-10-03 03:54:37 +020010805 left = *p_left;
10806 if (right == NULL || !PyUnicode_Check(left)) {
10807 if (!PyErr_Occurred())
10808 PyErr_BadInternalCall();
10809 goto error;
10810 }
10811
Victor Stinnere1335c72011-10-04 20:53:03 +020010812 if (PyUnicode_READY(left))
10813 goto error;
10814 if (PyUnicode_READY(right))
10815 goto error;
10816
Victor Stinner23e56682011-10-03 03:54:37 +020010817 if (PyUnicode_CheckExact(left) && left != unicode_empty
10818 && PyUnicode_CheckExact(right) && right != unicode_empty
10819 && unicode_resizable(left)
10820 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10821 || _PyUnicode_WSTR(left) != NULL))
10822 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010823 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10824 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010825 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010826 not so different than duplicating the string. */
10827 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010828 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010829 unicode_append_inplace(p_left, right);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010830 assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010831 return;
10832 }
10833 }
10834
10835 res = PyUnicode_Concat(left, right);
10836 if (res == NULL)
10837 goto error;
10838 Py_DECREF(left);
10839 *p_left = res;
10840 return;
10841
10842error:
10843 Py_DECREF(*p_left);
10844 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010845}
10846
10847void
10848PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10849{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010850 PyUnicode_Append(pleft, right);
10851 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010852}
10853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010854PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010855 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010856\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010857Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010858string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010859interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860
10861static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010862unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010863{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010864 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010865 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010866 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010867 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010868 int kind1, kind2, kind;
10869 void *buf1, *buf2;
10870 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871
Jesus Ceaac451502011-04-20 17:09:23 +020010872 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10873 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010874 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010876 kind1 = PyUnicode_KIND(self);
10877 kind2 = PyUnicode_KIND(substring);
10878 kind = kind1 > kind2 ? kind1 : kind2;
10879 buf1 = PyUnicode_DATA(self);
10880 buf2 = PyUnicode_DATA(substring);
10881 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010882 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883 if (!buf1) {
10884 Py_DECREF(substring);
10885 return NULL;
10886 }
10887 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010888 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010889 if (!buf2) {
10890 Py_DECREF(substring);
10891 if (kind1 != kind) PyMem_Free(buf1);
10892 return NULL;
10893 }
10894 len1 = PyUnicode_GET_LENGTH(self);
10895 len2 = PyUnicode_GET_LENGTH(substring);
10896
10897 ADJUST_INDICES(start, end, len1);
10898 switch(kind) {
10899 case PyUnicode_1BYTE_KIND:
10900 iresult = ucs1lib_count(
10901 ((Py_UCS1*)buf1) + start, end - start,
10902 buf2, len2, PY_SSIZE_T_MAX
10903 );
10904 break;
10905 case PyUnicode_2BYTE_KIND:
10906 iresult = ucs2lib_count(
10907 ((Py_UCS2*)buf1) + start, end - start,
10908 buf2, len2, PY_SSIZE_T_MAX
10909 );
10910 break;
10911 case PyUnicode_4BYTE_KIND:
10912 iresult = ucs4lib_count(
10913 ((Py_UCS4*)buf1) + start, end - start,
10914 buf2, len2, PY_SSIZE_T_MAX
10915 );
10916 break;
10917 default:
10918 assert(0); iresult = 0;
10919 }
10920
10921 result = PyLong_FromSsize_t(iresult);
10922
10923 if (kind1 != kind)
10924 PyMem_Free(buf1);
10925 if (kind2 != kind)
10926 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927
10928 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010929
Guido van Rossumd57fd912000-03-10 22:53:23 +000010930 return result;
10931}
10932
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010933PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010934 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010935\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010936Encode S using the codec registered for encoding. Default encoding\n\
10937is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010938handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010939a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10940'xmlcharrefreplace' as well as any other name registered with\n\
10941codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942
10943static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010944unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010946 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947 char *encoding = NULL;
10948 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010949
Benjamin Peterson308d6372009-09-18 21:42:35 +000010950 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10951 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010953 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010954}
10955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010956PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010957 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958\n\
10959Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010960If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961
10962static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010963unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010965 Py_ssize_t i, j, line_pos, src_len, incr;
10966 Py_UCS4 ch;
10967 PyObject *u;
10968 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010970 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010971 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972
10973 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010974 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975
Antoine Pitrou22425222011-10-04 19:10:51 +020010976 if (PyUnicode_READY(self) == -1)
10977 return NULL;
10978
Thomas Wouters7e474022000-07-16 12:04:32 +000010979 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010980 src_len = PyUnicode_GET_LENGTH(self);
10981 i = j = line_pos = 0;
10982 kind = PyUnicode_KIND(self);
10983 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010984 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010985 for (; i < src_len; i++) {
10986 ch = PyUnicode_READ(kind, src_data, i);
10987 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010988 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010989 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010990 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010991 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010992 goto overflow;
10993 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010994 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010995 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010996 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010998 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010999 goto overflow;
11000 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011002 if (ch == '\n' || ch == '\r')
11003 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011005 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020011006 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010011007 Py_INCREF(self);
11008 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011009 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011010
Guido van Rossumd57fd912000-03-10 22:53:23 +000011011 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011012 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013 if (!u)
11014 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011015 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011016
Antoine Pitroue71d5742011-10-04 15:55:09 +020011017 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018
Antoine Pitroue71d5742011-10-04 15:55:09 +020011019 for (; i < src_len; i++) {
11020 ch = PyUnicode_READ(kind, src_data, i);
11021 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011022 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011023 incr = tabsize - (line_pos % tabsize);
11024 line_pos += incr;
11025 while (incr--) {
11026 PyUnicode_WRITE(kind, dest_data, j, ' ');
11027 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011028 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011029 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011030 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011031 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011032 line_pos++;
11033 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011034 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011035 if (ch == '\n' || ch == '\r')
11036 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011038 }
11039 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011040 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011041
Antoine Pitroue71d5742011-10-04 15:55:09 +020011042 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011043 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11044 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045}
11046
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011047PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011048 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049\n\
11050Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011051such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011052arguments start and end are interpreted as in slice notation.\n\
11053\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011054Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055
11056static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011057unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011059 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011060 Py_ssize_t start;
11061 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011062 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063
Jesus Ceaac451502011-04-20 17:09:23 +020011064 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11065 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011068 if (PyUnicode_READY(self) == -1)
11069 return NULL;
11070 if (PyUnicode_READY(substring) == -1)
11071 return NULL;
11072
Victor Stinner7931d9a2011-11-04 00:22:48 +010011073 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074
11075 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011077 if (result == -2)
11078 return NULL;
11079
Christian Heimes217cfd12007-12-02 14:31:20 +000011080 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081}
11082
11083static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011084unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011086 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11087 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011088 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011089 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090}
11091
Guido van Rossumc2504932007-09-18 19:42:40 +000011092/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011093 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011094static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011095unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011096{
Guido van Rossumc2504932007-09-18 19:42:40 +000011097 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011098 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011100 if (_PyUnicode_HASH(self) != -1)
11101 return _PyUnicode_HASH(self);
11102 if (PyUnicode_READY(self) == -1)
11103 return -1;
11104 len = PyUnicode_GET_LENGTH(self);
11105
11106 /* The hash function as a macro, gets expanded three times below. */
11107#define HASH(P) \
11108 x = (Py_uhash_t)*P << 7; \
11109 while (--len >= 0) \
11110 x = (1000003*x) ^ (Py_uhash_t)*P++;
11111
11112 switch (PyUnicode_KIND(self)) {
11113 case PyUnicode_1BYTE_KIND: {
11114 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11115 HASH(c);
11116 break;
11117 }
11118 case PyUnicode_2BYTE_KIND: {
11119 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11120 HASH(s);
11121 break;
11122 }
11123 default: {
11124 Py_UCS4 *l;
11125 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11126 "Impossible switch case in unicode_hash");
11127 l = PyUnicode_4BYTE_DATA(self);
11128 HASH(l);
11129 break;
11130 }
11131 }
11132 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11133
Guido van Rossumc2504932007-09-18 19:42:40 +000011134 if (x == -1)
11135 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011137 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011141PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011142 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011144Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145
11146static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011149 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011150 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011151 Py_ssize_t start;
11152 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153
Jesus Ceaac451502011-04-20 17:09:23 +020011154 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11155 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011156 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 if (PyUnicode_READY(self) == -1)
11159 return NULL;
11160 if (PyUnicode_READY(substring) == -1)
11161 return NULL;
11162
Victor Stinner7931d9a2011-11-04 00:22:48 +010011163 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164
11165 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011167 if (result == -2)
11168 return NULL;
11169
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170 if (result < 0) {
11171 PyErr_SetString(PyExc_ValueError, "substring not found");
11172 return NULL;
11173 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011174
Christian Heimes217cfd12007-12-02 14:31:20 +000011175 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176}
11177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011178PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011179 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011181Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011182at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183
11184static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011185unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011187 Py_ssize_t i, length;
11188 int kind;
11189 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190 int cased;
11191
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 if (PyUnicode_READY(self) == -1)
11193 return NULL;
11194 length = PyUnicode_GET_LENGTH(self);
11195 kind = PyUnicode_KIND(self);
11196 data = PyUnicode_DATA(self);
11197
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199 if (length == 1)
11200 return PyBool_FromLong(
11201 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011203 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011205 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011206
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 for (i = 0; i < length; i++) {
11209 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011210
Benjamin Peterson29060642009-01-31 22:14:21 +000011211 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11212 return PyBool_FromLong(0);
11213 else if (!cased && Py_UNICODE_ISLOWER(ch))
11214 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011216 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217}
11218
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011219PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011220 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011222Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011223at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224
11225static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011226unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011228 Py_ssize_t i, length;
11229 int kind;
11230 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231 int cased;
11232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011233 if (PyUnicode_READY(self) == -1)
11234 return NULL;
11235 length = PyUnicode_GET_LENGTH(self);
11236 kind = PyUnicode_KIND(self);
11237 data = PyUnicode_DATA(self);
11238
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011240 if (length == 1)
11241 return PyBool_FromLong(
11242 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011244 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011246 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011247
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011249 for (i = 0; i < length; i++) {
11250 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011251
Benjamin Peterson29060642009-01-31 22:14:21 +000011252 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11253 return PyBool_FromLong(0);
11254 else if (!cased && Py_UNICODE_ISUPPER(ch))
11255 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011257 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258}
11259
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011260PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011261 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011263Return True if S is a titlecased string and there is at least one\n\
11264character in S, i.e. upper- and titlecase characters may only\n\
11265follow uncased characters and lowercase characters only cased ones.\n\
11266Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267
11268static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011269unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011271 Py_ssize_t i, length;
11272 int kind;
11273 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274 int cased, previous_is_cased;
11275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011276 if (PyUnicode_READY(self) == -1)
11277 return NULL;
11278 length = PyUnicode_GET_LENGTH(self);
11279 kind = PyUnicode_KIND(self);
11280 data = PyUnicode_DATA(self);
11281
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011283 if (length == 1) {
11284 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11285 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11286 (Py_UNICODE_ISUPPER(ch) != 0));
11287 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011289 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011290 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011291 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011292
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293 cased = 0;
11294 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295 for (i = 0; i < length; i++) {
11296 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011297
Benjamin Peterson29060642009-01-31 22:14:21 +000011298 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11299 if (previous_is_cased)
11300 return PyBool_FromLong(0);
11301 previous_is_cased = 1;
11302 cased = 1;
11303 }
11304 else if (Py_UNICODE_ISLOWER(ch)) {
11305 if (!previous_is_cased)
11306 return PyBool_FromLong(0);
11307 previous_is_cased = 1;
11308 cased = 1;
11309 }
11310 else
11311 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011312 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011313 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314}
11315
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011316PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011317 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011319Return True if all characters in S are whitespace\n\
11320and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321
11322static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011323unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325 Py_ssize_t i, length;
11326 int kind;
11327 void *data;
11328
11329 if (PyUnicode_READY(self) == -1)
11330 return NULL;
11331 length = PyUnicode_GET_LENGTH(self);
11332 kind = PyUnicode_KIND(self);
11333 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011336 if (length == 1)
11337 return PyBool_FromLong(
11338 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011339
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011340 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011342 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 for (i = 0; i < length; i++) {
11345 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011346 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011347 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011349 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350}
11351
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011352PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011353 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011354\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011355Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011356and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011357
11358static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011359unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 Py_ssize_t i, length;
11362 int kind;
11363 void *data;
11364
11365 if (PyUnicode_READY(self) == -1)
11366 return NULL;
11367 length = PyUnicode_GET_LENGTH(self);
11368 kind = PyUnicode_KIND(self);
11369 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011370
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011371 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011372 if (length == 1)
11373 return PyBool_FromLong(
11374 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011375
11376 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011378 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380 for (i = 0; i < length; i++) {
11381 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011382 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011383 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011384 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011385}
11386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011387PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011388 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011389\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011390Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011391and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011392
11393static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011394unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011395{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396 int kind;
11397 void *data;
11398 Py_ssize_t len, i;
11399
11400 if (PyUnicode_READY(self) == -1)
11401 return NULL;
11402
11403 kind = PyUnicode_KIND(self);
11404 data = PyUnicode_DATA(self);
11405 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011406
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011407 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 if (len == 1) {
11409 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11410 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11411 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011412
11413 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011414 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 for (i = 0; i < len; i++) {
11418 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011419 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011420 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011421 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011422 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011423}
11424
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011425PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011426 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011428Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011429False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
11431static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011432unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 Py_ssize_t i, length;
11435 int kind;
11436 void *data;
11437
11438 if (PyUnicode_READY(self) == -1)
11439 return NULL;
11440 length = PyUnicode_GET_LENGTH(self);
11441 kind = PyUnicode_KIND(self);
11442 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 if (length == 1)
11446 return PyBool_FromLong(
11447 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011449 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011451 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 for (i = 0; i < length; i++) {
11454 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011455 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011457 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458}
11459
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011460PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011461 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011463Return True if all characters in S are digits\n\
11464and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465
11466static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011467unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 Py_ssize_t i, length;
11470 int kind;
11471 void *data;
11472
11473 if (PyUnicode_READY(self) == -1)
11474 return NULL;
11475 length = PyUnicode_GET_LENGTH(self);
11476 kind = PyUnicode_KIND(self);
11477 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011480 if (length == 1) {
11481 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11482 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11483 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011485 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011487 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 for (i = 0; i < length; i++) {
11490 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011491 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011493 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494}
11495
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011496PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011497 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011499Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011500False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501
11502static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011503unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011505 Py_ssize_t i, length;
11506 int kind;
11507 void *data;
11508
11509 if (PyUnicode_READY(self) == -1)
11510 return NULL;
11511 length = PyUnicode_GET_LENGTH(self);
11512 kind = PyUnicode_KIND(self);
11513 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011516 if (length == 1)
11517 return PyBool_FromLong(
11518 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011520 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011522 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 for (i = 0; i < length; i++) {
11525 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011526 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011528 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529}
11530
Martin v. Löwis47383402007-08-15 07:32:56 +000011531int
11532PyUnicode_IsIdentifier(PyObject *self)
11533{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011534 int kind;
11535 void *data;
11536 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011537 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539 if (PyUnicode_READY(self) == -1) {
11540 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011541 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011542 }
11543
11544 /* Special case for empty strings */
11545 if (PyUnicode_GET_LENGTH(self) == 0)
11546 return 0;
11547 kind = PyUnicode_KIND(self);
11548 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011549
11550 /* PEP 3131 says that the first character must be in
11551 XID_Start and subsequent characters in XID_Continue,
11552 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011553 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011554 letters, digits, underscore). However, given the current
11555 definition of XID_Start and XID_Continue, it is sufficient
11556 to check just for these, except that _ must be allowed
11557 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011559 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011560 return 0;
11561
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011562 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011563 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011564 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011565 return 1;
11566}
11567
11568PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011569 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011570\n\
11571Return True if S is a valid identifier according\n\
11572to the language definition.");
11573
11574static PyObject*
11575unicode_isidentifier(PyObject *self)
11576{
11577 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11578}
11579
Georg Brandl559e5d72008-06-11 18:37:52 +000011580PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011582\n\
11583Return True if all characters in S are considered\n\
11584printable in repr() or S is empty, False otherwise.");
11585
11586static PyObject*
11587unicode_isprintable(PyObject *self)
11588{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011589 Py_ssize_t i, length;
11590 int kind;
11591 void *data;
11592
11593 if (PyUnicode_READY(self) == -1)
11594 return NULL;
11595 length = PyUnicode_GET_LENGTH(self);
11596 kind = PyUnicode_KIND(self);
11597 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011598
11599 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011600 if (length == 1)
11601 return PyBool_FromLong(
11602 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011603
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011604 for (i = 0; i < length; i++) {
11605 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011606 Py_RETURN_FALSE;
11607 }
11608 }
11609 Py_RETURN_TRUE;
11610}
11611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011612PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011613 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614\n\
11615Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011616iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617
11618static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011619unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011621 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622}
11623
Martin v. Löwis18e16552006-02-15 17:27:45 +000011624static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011625unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 if (PyUnicode_READY(self) == -1)
11628 return -1;
11629 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630}
11631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011632PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011635Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011636done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637
11638static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011639unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011641 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011642 Py_UCS4 fillchar = ' ';
11643
11644 if (PyUnicode_READY(self) == -1)
11645 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011646
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011647 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648 return NULL;
11649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011651 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011652 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653 }
11654
Victor Stinner7931d9a2011-11-04 00:22:48 +010011655 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656}
11657
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011658PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011659 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011661Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662
11663static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011664unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666 return fixup(self, fixlower);
11667}
11668
/* Strip modes; the values double as indices into stripformat[] below */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i] + 3)
11677
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011678/* externally visible for str.strip(unicode) */
11679PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011680_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011681{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 void *data;
11683 int kind;
11684 Py_ssize_t i, j, len;
11685 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11688 return NULL;
11689
11690 kind = PyUnicode_KIND(self);
11691 data = PyUnicode_DATA(self);
11692 len = PyUnicode_GET_LENGTH(self);
11693 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11694 PyUnicode_DATA(sepobj),
11695 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011696
Benjamin Peterson14339b62009-01-31 16:36:08 +000011697 i = 0;
11698 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 while (i < len &&
11700 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011701 i++;
11702 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011703 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011704
Benjamin Peterson14339b62009-01-31 16:36:08 +000011705 j = len;
11706 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011707 do {
11708 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 } while (j >= i &&
11710 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011711 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011712 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011713
Victor Stinner7931d9a2011-11-04 00:22:48 +010011714 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715}
11716
11717PyObject*
11718PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11719{
11720 unsigned char *data;
11721 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011722 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723
Victor Stinnerde636f32011-10-01 03:55:54 +020011724 if (PyUnicode_READY(self) == -1)
11725 return NULL;
11726
11727 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11728
Victor Stinner12bab6d2011-10-01 01:53:49 +020011729 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011730 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011731 if (PyUnicode_CheckExact(self)) {
11732 Py_INCREF(self);
11733 return self;
11734 }
11735 else
11736 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 }
11738
Victor Stinner12bab6d2011-10-01 01:53:49 +020011739 length = end - start;
11740 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011741 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011742
Victor Stinnerde636f32011-10-01 03:55:54 +020011743 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011744 PyErr_SetString(PyExc_IndexError, "string index out of range");
11745 return NULL;
11746 }
11747
Victor Stinnerb9275c12011-10-05 14:01:42 +020011748 if (PyUnicode_IS_ASCII(self)) {
11749 kind = PyUnicode_KIND(self);
11750 data = PyUnicode_1BYTE_DATA(self);
11751 return unicode_fromascii(data + start, length);
11752 }
11753 else {
11754 kind = PyUnicode_KIND(self);
11755 data = PyUnicode_1BYTE_DATA(self);
11756 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011757 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011758 length);
11759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761
11762static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011763do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 int kind;
11766 void *data;
11767 Py_ssize_t len, i, j;
11768
11769 if (PyUnicode_READY(self) == -1)
11770 return NULL;
11771
11772 kind = PyUnicode_KIND(self);
11773 data = PyUnicode_DATA(self);
11774 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011775
Benjamin Peterson14339b62009-01-31 16:36:08 +000011776 i = 0;
11777 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011779 i++;
11780 }
11781 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011782
Benjamin Peterson14339b62009-01-31 16:36:08 +000011783 j = len;
11784 if (striptype != LEFTSTRIP) {
11785 do {
11786 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011787 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011788 j++;
11789 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011790
Victor Stinner7931d9a2011-11-04 00:22:48 +010011791 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792}
11793
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011794
11795static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011796do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011797{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011798 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011799
Benjamin Peterson14339b62009-01-31 16:36:08 +000011800 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11801 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011802
Benjamin Peterson14339b62009-01-31 16:36:08 +000011803 if (sep != NULL && sep != Py_None) {
11804 if (PyUnicode_Check(sep))
11805 return _PyUnicode_XStrip(self, striptype, sep);
11806 else {
11807 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011808 "%s arg must be None or str",
11809 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011810 return NULL;
11811 }
11812 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011813
Benjamin Peterson14339b62009-01-31 16:36:08 +000011814 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011815}
11816
11817
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011818PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011819 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011820\n\
11821Return a copy of the string S with leading and trailing\n\
11822whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011823If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011824
11825static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011826unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011827{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011828 if (PyTuple_GET_SIZE(args) == 0)
11829 return do_strip(self, BOTHSTRIP); /* Common case */
11830 else
11831 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011832}
11833
11834
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011835PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011836 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011837\n\
11838Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011839If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011840
11841static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011842unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011843{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011844 if (PyTuple_GET_SIZE(args) == 0)
11845 return do_strip(self, LEFTSTRIP); /* Common case */
11846 else
11847 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011848}
11849
11850
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011851PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011852 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011853\n\
11854Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011855If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011856
11857static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011858unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011859{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011860 if (PyTuple_GET_SIZE(args) == 0)
11861 return do_strip(self, RIGHTSTRIP); /* Common case */
11862 else
11863 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011864}
11865
11866
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011868unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011870 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872
Georg Brandl222de0f2009-04-12 12:01:50 +000011873 if (len < 1) {
11874 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011875 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011876 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877
Tim Peters7a29bd52001-09-12 03:03:31 +000011878 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879 /* no repeat, return original string */
11880 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011881 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882 }
Tim Peters8f422462000-09-09 06:13:41 +000011883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011884 if (PyUnicode_READY(str) == -1)
11885 return NULL;
11886
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011887 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011888 PyErr_SetString(PyExc_OverflowError,
11889 "repeated string is too long");
11890 return NULL;
11891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011893
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011894 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895 if (!u)
11896 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011897 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 if (PyUnicode_GET_LENGTH(str) == 1) {
11900 const int kind = PyUnicode_KIND(str);
11901 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11902 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011903 if (kind == PyUnicode_1BYTE_KIND)
11904 memset(to, (unsigned char)fill_char, len);
11905 else {
11906 for (n = 0; n < len; ++n)
11907 PyUnicode_WRITE(kind, to, n, fill_char);
11908 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 }
11910 else {
11911 /* number of characters copied this far */
11912 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011913 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 char *to = (char *) PyUnicode_DATA(u);
11915 Py_MEMCPY(to, PyUnicode_DATA(str),
11916 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011917 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011918 n = (done <= nchars-done) ? done : nchars-done;
11919 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011920 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011921 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922 }
11923
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011924 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011925 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926}
11927
Alexander Belopolsky40018472011-02-26 01:02:56 +000011928PyObject *
11929PyUnicode_Replace(PyObject *obj,
11930 PyObject *subobj,
11931 PyObject *replobj,
11932 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933{
11934 PyObject *self;
11935 PyObject *str1;
11936 PyObject *str2;
11937 PyObject *result;
11938
11939 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011940 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011943 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011944 Py_DECREF(self);
11945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946 }
11947 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011948 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011949 Py_DECREF(self);
11950 Py_DECREF(str1);
11951 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011953 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954 Py_DECREF(self);
11955 Py_DECREF(str1);
11956 Py_DECREF(str2);
11957 return result;
11958}
11959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011960PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011961 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962\n\
11963Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011964old replaced by new. If the optional argument count is\n\
11965given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966
11967static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 PyObject *str1;
11971 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011972 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973 PyObject *result;
11974
Martin v. Löwis18e16552006-02-15 17:27:45 +000011975 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011977 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011978 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 str1 = PyUnicode_FromObject(str1);
11980 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11981 return NULL;
11982 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011983 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011984 Py_DECREF(str1);
11985 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987
11988 result = replace(self, str1, str2, maxcount);
11989
11990 Py_DECREF(str1);
11991 Py_DECREF(str2);
11992 return result;
11993}
11994
Alexander Belopolsky40018472011-02-26 01:02:56 +000011995static PyObject *
11996unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011998 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 Py_ssize_t isize;
12000 Py_ssize_t osize, squote, dquote, i, o;
12001 Py_UCS4 max, quote;
12002 int ikind, okind;
12003 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012006 return NULL;
12007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 isize = PyUnicode_GET_LENGTH(unicode);
12009 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 /* Compute length of output, quote characters, and
12012 maximum character */
12013 osize = 2; /* quotes */
12014 max = 127;
12015 squote = dquote = 0;
12016 ikind = PyUnicode_KIND(unicode);
12017 for (i = 0; i < isize; i++) {
12018 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12019 switch (ch) {
12020 case '\'': squote++; osize++; break;
12021 case '"': dquote++; osize++; break;
12022 case '\\': case '\t': case '\r': case '\n':
12023 osize += 2; break;
12024 default:
12025 /* Fast-path ASCII */
12026 if (ch < ' ' || ch == 0x7f)
12027 osize += 4; /* \xHH */
12028 else if (ch < 0x7f)
12029 osize++;
12030 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12031 osize++;
12032 max = ch > max ? ch : max;
12033 }
12034 else if (ch < 0x100)
12035 osize += 4; /* \xHH */
12036 else if (ch < 0x10000)
12037 osize += 6; /* \uHHHH */
12038 else
12039 osize += 10; /* \uHHHHHHHH */
12040 }
12041 }
12042
12043 quote = '\'';
12044 if (squote) {
12045 if (dquote)
12046 /* Both squote and dquote present. Use squote,
12047 and escape them */
12048 osize += squote;
12049 else
12050 quote = '"';
12051 }
12052
12053 repr = PyUnicode_New(osize, max);
12054 if (repr == NULL)
12055 return NULL;
12056 okind = PyUnicode_KIND(repr);
12057 odata = PyUnicode_DATA(repr);
12058
12059 PyUnicode_WRITE(okind, odata, 0, quote);
12060 PyUnicode_WRITE(okind, odata, osize-1, quote);
12061
12062 for (i = 0, o = 1; i < isize; i++) {
12063 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012064
12065 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 if ((ch == quote) || (ch == '\\')) {
12067 PyUnicode_WRITE(okind, odata, o++, '\\');
12068 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012069 continue;
12070 }
12071
Benjamin Peterson29060642009-01-31 22:14:21 +000012072 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012073 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012074 PyUnicode_WRITE(okind, odata, o++, '\\');
12075 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012076 }
12077 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 PyUnicode_WRITE(okind, odata, o++, '\\');
12079 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012080 }
12081 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082 PyUnicode_WRITE(okind, odata, o++, '\\');
12083 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012084 }
12085
12086 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012087 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088 PyUnicode_WRITE(okind, odata, o++, '\\');
12089 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012090 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12091 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012092 }
12093
Georg Brandl559e5d72008-06-11 18:37:52 +000012094 /* Copy ASCII characters as-is */
12095 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012097 }
12098
Benjamin Peterson29060642009-01-31 22:14:21 +000012099 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012100 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012101 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012102 (categories Z* and C* except ASCII space)
12103 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012104 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012105 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012106 if (ch <= 0xff) {
12107 PyUnicode_WRITE(okind, odata, o++, '\\');
12108 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012109 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12110 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012111 }
12112 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 else if (ch >= 0x10000) {
12114 PyUnicode_WRITE(okind, odata, o++, '\\');
12115 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012116 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12117 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12118 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12119 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12120 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12121 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12122 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12123 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012124 }
12125 /* Map 16-bit characters to '\uxxxx' */
12126 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 PyUnicode_WRITE(okind, odata, o++, '\\');
12128 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012129 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12130 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12131 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12132 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012133 }
12134 }
12135 /* Copy characters as-is */
12136 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012137 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012138 }
12139 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012140 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012142 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012143 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144}
12145
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012146PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012147 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148\n\
12149Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012150such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151arguments start and end are interpreted as in slice notation.\n\
12152\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012153Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154
12155static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012156unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012158 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012159 Py_ssize_t start;
12160 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012161 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162
Jesus Ceaac451502011-04-20 17:09:23 +020012163 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12164 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012165 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012167 if (PyUnicode_READY(self) == -1)
12168 return NULL;
12169 if (PyUnicode_READY(substring) == -1)
12170 return NULL;
12171
Victor Stinner7931d9a2011-11-04 00:22:48 +010012172 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173
12174 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012176 if (result == -2)
12177 return NULL;
12178
Christian Heimes217cfd12007-12-02 14:31:20 +000012179 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180}
12181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012182PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012183 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012185Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186
12187static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012190 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012191 Py_ssize_t start;
12192 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012193 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194
Jesus Ceaac451502011-04-20 17:09:23 +020012195 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12196 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012197 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 if (PyUnicode_READY(self) == -1)
12200 return NULL;
12201 if (PyUnicode_READY(substring) == -1)
12202 return NULL;
12203
Victor Stinner7931d9a2011-11-04 00:22:48 +010012204 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205
12206 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 if (result == -2)
12209 return NULL;
12210
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211 if (result < 0) {
12212 PyErr_SetString(PyExc_ValueError, "substring not found");
12213 return NULL;
12214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215
Christian Heimes217cfd12007-12-02 14:31:20 +000012216 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217}
12218
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012219PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012220 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012222Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012223done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224
12225static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012226unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012228 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 Py_UCS4 fillchar = ' ';
12230
Victor Stinnere9a29352011-10-01 02:14:59 +020012231 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012232 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012233
Victor Stinnere9a29352011-10-01 02:14:59 +020012234 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235 return NULL;
12236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012239 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240 }
12241
Victor Stinner7931d9a2011-11-04 00:22:48 +010012242 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243}
12244
Alexander Belopolsky40018472011-02-26 01:02:56 +000012245PyObject *
12246PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247{
12248 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012249
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250 s = PyUnicode_FromObject(s);
12251 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012252 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012253 if (sep != NULL) {
12254 sep = PyUnicode_FromObject(sep);
12255 if (sep == NULL) {
12256 Py_DECREF(s);
12257 return NULL;
12258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259 }
12260
Victor Stinner9310abb2011-10-05 00:59:23 +020012261 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262
12263 Py_DECREF(s);
12264 Py_XDECREF(sep);
12265 return result;
12266}
12267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012268PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012269 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270\n\
12271Return a list of the words in S, using sep as the\n\
12272delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012273splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012274whitespace string is a separator and empty strings are\n\
12275removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276
12277static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012278unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279{
12280 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012281 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282
Martin v. Löwis18e16552006-02-15 17:27:45 +000012283 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284 return NULL;
12285
12286 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012287 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012289 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012291 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292}
12293
Thomas Wouters477c8d52006-05-27 19:21:47 +000012294PyObject *
12295PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12296{
12297 PyObject* str_obj;
12298 PyObject* sep_obj;
12299 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 int kind1, kind2, kind;
12301 void *buf1 = NULL, *buf2 = NULL;
12302 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012303
12304 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012305 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012306 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012307 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012309 Py_DECREF(str_obj);
12310 return NULL;
12311 }
12312
Victor Stinner14f8f022011-10-05 20:58:25 +020012313 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012315 kind = Py_MAX(kind1, kind2);
12316 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012318 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 if (!buf1)
12320 goto onError;
12321 buf2 = PyUnicode_DATA(sep_obj);
12322 if (kind2 != kind)
12323 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12324 if (!buf2)
12325 goto onError;
12326 len1 = PyUnicode_GET_LENGTH(str_obj);
12327 len2 = PyUnicode_GET_LENGTH(sep_obj);
12328
Victor Stinner14f8f022011-10-05 20:58:25 +020012329 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012331 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12332 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12333 else
12334 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 break;
12336 case PyUnicode_2BYTE_KIND:
12337 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12338 break;
12339 case PyUnicode_4BYTE_KIND:
12340 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12341 break;
12342 default:
12343 assert(0);
12344 out = 0;
12345 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012346
12347 Py_DECREF(sep_obj);
12348 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 if (kind1 != kind)
12350 PyMem_Free(buf1);
12351 if (kind2 != kind)
12352 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012353
12354 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 onError:
12356 Py_DECREF(sep_obj);
12357 Py_DECREF(str_obj);
12358 if (kind1 != kind && buf1)
12359 PyMem_Free(buf1);
12360 if (kind2 != kind && buf2)
12361 PyMem_Free(buf2);
12362 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012363}
12364
12365
12366PyObject *
12367PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12368{
12369 PyObject* str_obj;
12370 PyObject* sep_obj;
12371 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 int kind1, kind2, kind;
12373 void *buf1 = NULL, *buf2 = NULL;
12374 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012375
12376 str_obj = PyUnicode_FromObject(str_in);
12377 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012378 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012379 sep_obj = PyUnicode_FromObject(sep_in);
12380 if (!sep_obj) {
12381 Py_DECREF(str_obj);
12382 return NULL;
12383 }
12384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012385 kind1 = PyUnicode_KIND(str_in);
12386 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012387 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388 buf1 = PyUnicode_DATA(str_in);
12389 if (kind1 != kind)
12390 buf1 = _PyUnicode_AsKind(str_in, kind);
12391 if (!buf1)
12392 goto onError;
12393 buf2 = PyUnicode_DATA(sep_obj);
12394 if (kind2 != kind)
12395 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12396 if (!buf2)
12397 goto onError;
12398 len1 = PyUnicode_GET_LENGTH(str_obj);
12399 len2 = PyUnicode_GET_LENGTH(sep_obj);
12400
12401 switch(PyUnicode_KIND(str_in)) {
12402 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012403 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12404 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12405 else
12406 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012407 break;
12408 case PyUnicode_2BYTE_KIND:
12409 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12410 break;
12411 case PyUnicode_4BYTE_KIND:
12412 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12413 break;
12414 default:
12415 assert(0);
12416 out = 0;
12417 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012418
12419 Py_DECREF(sep_obj);
12420 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012421 if (kind1 != kind)
12422 PyMem_Free(buf1);
12423 if (kind2 != kind)
12424 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012425
12426 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012427 onError:
12428 Py_DECREF(sep_obj);
12429 Py_DECREF(str_obj);
12430 if (kind1 != kind && buf1)
12431 PyMem_Free(buf1);
12432 if (kind2 != kind && buf2)
12433 PyMem_Free(buf2);
12434 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012435}
12436
12437PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012438 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012439\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012440Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012441the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012442found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012443
12444static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012445unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012446{
Victor Stinner9310abb2011-10-05 00:59:23 +020012447 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012448}
12449
12450PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012451 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012452\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012453Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012454the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012455separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012456
12457static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012458unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012459{
Victor Stinner9310abb2011-10-05 00:59:23 +020012460 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012461}
12462
Alexander Belopolsky40018472011-02-26 01:02:56 +000012463PyObject *
12464PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012465{
12466 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012467
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012468 s = PyUnicode_FromObject(s);
12469 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012470 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012471 if (sep != NULL) {
12472 sep = PyUnicode_FromObject(sep);
12473 if (sep == NULL) {
12474 Py_DECREF(s);
12475 return NULL;
12476 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012477 }
12478
Victor Stinner9310abb2011-10-05 00:59:23 +020012479 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012480
12481 Py_DECREF(s);
12482 Py_XDECREF(sep);
12483 return result;
12484}
12485
12486PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012487 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012488\n\
12489Return a list of the words in S, using sep as the\n\
12490delimiter string, starting at the end of the string and\n\
12491working to the front. If maxsplit is given, at most maxsplit\n\
12492splits are done. If sep is not specified, any whitespace string\n\
12493is a separator.");
12494
12495static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012496unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012497{
12498 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012499 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012500
Martin v. Löwis18e16552006-02-15 17:27:45 +000012501 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012502 return NULL;
12503
12504 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012505 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012506 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012507 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012508 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012509 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012510}
12511
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012512PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012513 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012514\n\
12515Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012516Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012517is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518
12519static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012520unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012522 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012523 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012525 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12526 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527 return NULL;
12528
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012529 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530}
12531
12532static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012533PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012534{
Walter Dörwald346737f2007-05-31 10:44:43 +000012535 if (PyUnicode_CheckExact(self)) {
12536 Py_INCREF(self);
12537 return self;
12538 } else
12539 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012540 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541}
12542
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012543PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012544 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545\n\
12546Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012547and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548
12549static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012550unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552 return fixup(self, fixswapcase);
12553}
12554
Georg Brandlceee0772007-11-27 23:48:05 +000012555PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012556 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012557\n\
12558Return a translation table usable for str.translate().\n\
12559If there is only one argument, it must be a dictionary mapping Unicode\n\
12560ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012561Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012562If there are two arguments, they must be strings of equal length, and\n\
12563in the resulting dictionary, each character in x will be mapped to the\n\
12564character at the same position in y. If there is a third argument, it\n\
12565must be a string, whose characters will be mapped to None in the result.");
12566
12567static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012568unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012569{
12570 PyObject *x, *y = NULL, *z = NULL;
12571 PyObject *new = NULL, *key, *value;
12572 Py_ssize_t i = 0;
12573 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012574
Georg Brandlceee0772007-11-27 23:48:05 +000012575 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12576 return NULL;
12577 new = PyDict_New();
12578 if (!new)
12579 return NULL;
12580 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 int x_kind, y_kind, z_kind;
12582 void *x_data, *y_data, *z_data;
12583
Georg Brandlceee0772007-11-27 23:48:05 +000012584 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012585 if (!PyUnicode_Check(x)) {
12586 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12587 "be a string if there is a second argument");
12588 goto err;
12589 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012590 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012591 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12592 "arguments must have equal length");
12593 goto err;
12594 }
12595 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012596 x_kind = PyUnicode_KIND(x);
12597 y_kind = PyUnicode_KIND(y);
12598 x_data = PyUnicode_DATA(x);
12599 y_data = PyUnicode_DATA(y);
12600 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12601 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12602 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012603 if (!key || !value)
12604 goto err;
12605 res = PyDict_SetItem(new, key, value);
12606 Py_DECREF(key);
12607 Py_DECREF(value);
12608 if (res < 0)
12609 goto err;
12610 }
12611 /* create entries for deleting chars in z */
12612 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613 z_kind = PyUnicode_KIND(z);
12614 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012615 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012616 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012617 if (!key)
12618 goto err;
12619 res = PyDict_SetItem(new, key, Py_None);
12620 Py_DECREF(key);
12621 if (res < 0)
12622 goto err;
12623 }
12624 }
12625 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012626 int kind;
12627 void *data;
12628
Georg Brandlceee0772007-11-27 23:48:05 +000012629 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012630 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012631 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12632 "to maketrans it must be a dict");
12633 goto err;
12634 }
12635 /* copy entries into the new dict, converting string keys to int keys */
12636 while (PyDict_Next(x, &i, &key, &value)) {
12637 if (PyUnicode_Check(key)) {
12638 /* convert string keys to integer keys */
12639 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012640 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012641 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12642 "table must be of length 1");
12643 goto err;
12644 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 kind = PyUnicode_KIND(key);
12646 data = PyUnicode_DATA(key);
12647 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012648 if (!newkey)
12649 goto err;
12650 res = PyDict_SetItem(new, newkey, value);
12651 Py_DECREF(newkey);
12652 if (res < 0)
12653 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012654 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012655 /* just keep integer keys */
12656 if (PyDict_SetItem(new, key, value) < 0)
12657 goto err;
12658 } else {
12659 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12660 "be strings or integers");
12661 goto err;
12662 }
12663 }
12664 }
12665 return new;
12666 err:
12667 Py_DECREF(new);
12668 return NULL;
12669}
12670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012671PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012672 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012673\n\
12674Return a copy of the string S, where all characters have been mapped\n\
12675through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012676Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012677Unmapped characters are left untouched. Characters mapped to None\n\
12678are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679
12680static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684}
12685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012686PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012687 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012689Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690
12691static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012692unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012693{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694 return fixup(self, fixupper);
12695}
12696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012697PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012698 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012700Pad a numeric string S with zeros on the left, to fill a field\n\
12701of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012702
12703static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012704unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012705{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012706 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012707 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012708 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012709 int kind;
12710 void *data;
12711 Py_UCS4 chr;
12712
12713 if (PyUnicode_READY(self) == -1)
12714 return NULL;
12715
Martin v. Löwis18e16552006-02-15 17:27:45 +000012716 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012717 return NULL;
12718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012719 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012720 if (PyUnicode_CheckExact(self)) {
12721 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012722 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012723 }
12724 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012725 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012726 }
12727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012728 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012729
12730 u = pad(self, fill, 0, '0');
12731
Walter Dörwald068325e2002-04-15 13:36:47 +000012732 if (u == NULL)
12733 return NULL;
12734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 kind = PyUnicode_KIND(u);
12736 data = PyUnicode_DATA(u);
12737 chr = PyUnicode_READ(kind, data, fill);
12738
12739 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 PyUnicode_WRITE(kind, data, 0, chr);
12742 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012743 }
12744
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012745 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012746 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012748
#if 0
/* Compiled out by the surrounding #if 0.  Thin wrapper that forwards
   self to PyUnicode_TransformDecimalAndSpaceToASCII(); the method-table
   entry that would expose it ("_decimal2ascii") is likewise under #if 0
   and is described there as a debugging aid. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
12756
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012757PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012758 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012759\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012760Return True if S starts with the specified prefix, False otherwise.\n\
12761With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012762With optional end, stop comparing S at that position.\n\
12763prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764
12765static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012766unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012767 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012768{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012769 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012770 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012771 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012772 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012773 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012774
Jesus Ceaac451502011-04-20 17:09:23 +020012775 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012776 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012777 if (PyTuple_Check(subobj)) {
12778 Py_ssize_t i;
12779 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012780 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012781 if (substring == NULL)
12782 return NULL;
12783 result = tailmatch(self, substring, start, end, -1);
12784 Py_DECREF(substring);
12785 if (result) {
12786 Py_RETURN_TRUE;
12787 }
12788 }
12789 /* nothing matched */
12790 Py_RETURN_FALSE;
12791 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012792 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012793 if (substring == NULL) {
12794 if (PyErr_ExceptionMatches(PyExc_TypeError))
12795 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12796 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012797 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012798 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012799 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012800 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012801 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012802}
12803
12804
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012805PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012806 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012807\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012808Return True if S ends with the specified suffix, False otherwise.\n\
12809With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012810With optional end, stop comparing S at that position.\n\
12811suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812
12813static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012814unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012815 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012817 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012818 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012819 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012820 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012821 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012822
Jesus Ceaac451502011-04-20 17:09:23 +020012823 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012824 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012825 if (PyTuple_Check(subobj)) {
12826 Py_ssize_t i;
12827 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012828 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012829 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012830 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012831 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012832 result = tailmatch(self, substring, start, end, +1);
12833 Py_DECREF(substring);
12834 if (result) {
12835 Py_RETURN_TRUE;
12836 }
12837 }
12838 Py_RETURN_FALSE;
12839 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012840 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012841 if (substring == NULL) {
12842 if (PyErr_ExceptionMatches(PyExc_TypeError))
12843 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12844 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012845 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012846 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012847 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012848 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012849 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012850}
12851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012852#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012853
12854PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012855 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012856\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012857Return a formatted version of S, using substitutions from args and kwargs.\n\
12858The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012859
Eric Smith27bbca62010-11-04 17:06:58 +000012860PyDoc_STRVAR(format_map__doc__,
12861 "S.format_map(mapping) -> str\n\
12862\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012863Return a formatted version of S, using substitutions from mapping.\n\
12864The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012865
Eric Smith4a7d76d2008-05-30 18:10:19 +000012866static PyObject *
12867unicode__format__(PyObject* self, PyObject* args)
12868{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012869 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012870
12871 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12872 return NULL;
12873
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012874 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012875 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012876 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012877}
12878
Eric Smith8c663262007-08-25 02:26:07 +000012879PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012880 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012881\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012882Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012883
12884static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012885unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012886{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012887 Py_ssize_t size;
12888
12889 /* If it's a compact object, account for base structure +
12890 character data. */
12891 if (PyUnicode_IS_COMPACT_ASCII(v))
12892 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12893 else if (PyUnicode_IS_COMPACT(v))
12894 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012895 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012896 else {
12897 /* If it is a two-block object, account for base object, and
12898 for character block if present. */
12899 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012900 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012901 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012902 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012903 }
12904 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012905 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012906 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012907 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012908 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012909 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012910
12911 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012912}
12913
12914PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012915 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012916
12917static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012918unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012919{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012920 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012921 if (!copy)
12922 return NULL;
12923 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012924}
12925
/* Method table for the str type.  Each entry gives the Python-visible
   name, the C implementation (cast to PyCFunction), the calling
   convention flags and, for most entries, the docstring variable.  The
   table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only METH_STATIC entry: it is called without a
       string instance. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* Compiled out. */
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
12989
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012990static PyObject *
12991unicode_mod(PyObject *v, PyObject *w)
12992{
Brian Curtindfc80e32011-08-10 20:28:54 -050012993 if (!PyUnicode_Check(v))
12994 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012995 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012996}
12997
/* Number protocol table for str.  Only the nb_remainder slot is filled
   in (with unicode_mod); the three slots before it are explicitly 0 and
   the members after it are left to C's zero-initialization of omitted
   initializers. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13004
/* Sequence protocol table for str: length, concatenation, repetition,
   item access and containment are provided; the slice and item
   assignment slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13015
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013016static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013017unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013018{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019 if (PyUnicode_READY(self) == -1)
13020 return NULL;
13021
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013022 if (PyIndex_Check(item)) {
13023 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013024 if (i == -1 && PyErr_Occurred())
13025 return NULL;
13026 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013027 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013028 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013029 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013030 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013031 PyObject *result;
13032 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013033 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013034 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013036 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013037 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013038 return NULL;
13039 }
13040
13041 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013042 return PyUnicode_New(0, 0);
13043 } else if (start == 0 && step == 1 &&
13044 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013045 PyUnicode_CheckExact(self)) {
13046 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013047 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000013048 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013049 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013050 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013051 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013052 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013053 src_kind = PyUnicode_KIND(self);
13054 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013055 if (!PyUnicode_IS_ASCII(self)) {
13056 kind_limit = kind_maxchar_limit(src_kind);
13057 max_char = 0;
13058 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13059 ch = PyUnicode_READ(src_kind, src_data, cur);
13060 if (ch > max_char) {
13061 max_char = ch;
13062 if (max_char >= kind_limit)
13063 break;
13064 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013065 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013066 }
Victor Stinner55c99112011-10-13 01:17:06 +020013067 else
13068 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013069 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013070 if (result == NULL)
13071 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013072 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013073 dest_data = PyUnicode_DATA(result);
13074
13075 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013076 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13077 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013078 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013079 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013080 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013081 } else {
13082 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13083 return NULL;
13084 }
13085}
13086
13087static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013088 (lenfunc)unicode_length, /* mp_length */
13089 (binaryfunc)unicode_subscript, /* mp_subscript */
13090 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013091};
13092
Guido van Rossumd57fd912000-03-10 22:53:23 +000013093
Guido van Rossumd57fd912000-03-10 22:53:23 +000013094/* Helpers for PyUnicode_Format() */
13095
13096static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013097getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013099 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013101 (*p_argidx)++;
13102 if (arglen < 0)
13103 return args;
13104 else
13105 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106 }
13107 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013108 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109 return NULL;
13110}
13111
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013112/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013114static PyObject *
13115formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013117 char *p;
13118 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013120
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121 x = PyFloat_AsDouble(v);
13122 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013123 return NULL;
13124
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013126 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013127
Eric Smith0923d1d2009-04-16 20:16:10 +000013128 p = PyOS_double_to_string(x, type, prec,
13129 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013130 if (p == NULL)
13131 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013132 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013133 PyMem_Free(p);
13134 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013135}
13136
Tim Peters38fd5b62000-09-21 05:43:11 +000013137static PyObject*
13138formatlong(PyObject *val, int flags, int prec, int type)
13139{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013140 char *buf;
13141 int len;
13142 PyObject *str; /* temporary string object. */
13143 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013144
Benjamin Peterson14339b62009-01-31 16:36:08 +000013145 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13146 if (!str)
13147 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013148 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013149 Py_DECREF(str);
13150 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013151}
13152
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013153static Py_UCS4
13154formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013155{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013156 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013157 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013158 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013159 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013160 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013161 goto onError;
13162 }
13163 else {
13164 /* Integer input truncated to a character */
13165 long x;
13166 x = PyLong_AsLong(v);
13167 if (x == -1 && PyErr_Occurred())
13168 goto onError;
13169
13170 if (x < 0 || x > 0x10ffff) {
13171 PyErr_SetString(PyExc_OverflowError,
13172 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013173 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013174 }
13175
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013176 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013177 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013178
Benjamin Peterson29060642009-01-31 22:14:21 +000013179 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013180 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013181 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013182 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013183}
13184
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013185static int
13186repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13187{
13188 int r;
13189 assert(count > 0);
13190 assert(PyUnicode_Check(obj));
13191 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013192 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013193 if (repeated == NULL)
13194 return -1;
13195 r = _PyAccu_Accumulate(acc, repeated);
13196 Py_DECREF(repeated);
13197 return r;
13198 }
13199 else {
13200 do {
13201 if (_PyAccu_Accumulate(acc, obj))
13202 return -1;
13203 } while (--count);
13204 return 0;
13205 }
13206}
13207
Alexander Belopolsky40018472011-02-26 01:02:56 +000013208PyObject *
13209PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013210{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013211 void *fmt;
13212 int fmtkind;
13213 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013214 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013215 int r;
13216 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013218 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013219 PyObject *temp = NULL;
13220 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013221 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013222 _PyAccu acc;
13223 static PyObject *plus, *minus, *blank, *zero, *percent;
13224
13225 if (!plus && !(plus = get_latin1_char('+')))
13226 return NULL;
13227 if (!minus && !(minus = get_latin1_char('-')))
13228 return NULL;
13229 if (!blank && !(blank = get_latin1_char(' ')))
13230 return NULL;
13231 if (!zero && !(zero = get_latin1_char('0')))
13232 return NULL;
13233 if (!percent && !(percent = get_latin1_char('%')))
13234 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013235
Guido van Rossumd57fd912000-03-10 22:53:23 +000013236 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013237 PyErr_BadInternalCall();
13238 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013239 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013240 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013241 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013242 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013243 if (_PyAccu_Init(&acc))
13244 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013245 fmt = PyUnicode_DATA(uformat);
13246 fmtkind = PyUnicode_KIND(uformat);
13247 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13248 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013249
Guido van Rossumd57fd912000-03-10 22:53:23 +000013250 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013251 arglen = PyTuple_Size(args);
13252 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013253 }
13254 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013255 arglen = -1;
13256 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013257 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013258 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013259 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013260 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013261
13262 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013263 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013264 PyObject *nonfmt;
13265 Py_ssize_t nonfmtpos;
13266 nonfmtpos = fmtpos++;
13267 while (fmtcnt >= 0 &&
13268 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13269 fmtpos++;
13270 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013271 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013272 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013273 if (nonfmt == NULL)
13274 goto onError;
13275 r = _PyAccu_Accumulate(&acc, nonfmt);
13276 Py_DECREF(nonfmt);
13277 if (r)
13278 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013279 }
13280 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013281 /* Got a format specifier */
13282 int flags = 0;
13283 Py_ssize_t width = -1;
13284 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013285 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013286 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013287 int isnumok;
13288 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013289 void *pbuf = NULL;
13290 Py_ssize_t pindex, len;
13291 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013293 fmtpos++;
13294 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13295 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013296 Py_ssize_t keylen;
13297 PyObject *key;
13298 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013299
Benjamin Peterson29060642009-01-31 22:14:21 +000013300 if (dict == NULL) {
13301 PyErr_SetString(PyExc_TypeError,
13302 "format requires a mapping");
13303 goto onError;
13304 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013305 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013306 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013307 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013308 /* Skip over balanced parentheses */
13309 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013310 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013311 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013312 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013313 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013314 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013316 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013317 if (fmtcnt < 0 || pcount > 0) {
13318 PyErr_SetString(PyExc_ValueError,
13319 "incomplete format key");
13320 goto onError;
13321 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013322 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013323 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013324 if (key == NULL)
13325 goto onError;
13326 if (args_owned) {
13327 Py_DECREF(args);
13328 args_owned = 0;
13329 }
13330 args = PyObject_GetItem(dict, key);
13331 Py_DECREF(key);
13332 if (args == NULL) {
13333 goto onError;
13334 }
13335 args_owned = 1;
13336 arglen = -1;
13337 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013338 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013339 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013340 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013341 case '-': flags |= F_LJUST; continue;
13342 case '+': flags |= F_SIGN; continue;
13343 case ' ': flags |= F_BLANK; continue;
13344 case '#': flags |= F_ALT; continue;
13345 case '0': flags |= F_ZERO; continue;
13346 }
13347 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013348 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013349 if (c == '*') {
13350 v = getnextarg(args, arglen, &argidx);
13351 if (v == NULL)
13352 goto onError;
13353 if (!PyLong_Check(v)) {
13354 PyErr_SetString(PyExc_TypeError,
13355 "* wants int");
13356 goto onError;
13357 }
13358 width = PyLong_AsLong(v);
13359 if (width == -1 && PyErr_Occurred())
13360 goto onError;
13361 if (width < 0) {
13362 flags |= F_LJUST;
13363 width = -width;
13364 }
13365 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013366 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013367 }
13368 else if (c >= '0' && c <= '9') {
13369 width = c - '0';
13370 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013371 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013372 if (c < '0' || c > '9')
13373 break;
13374 if ((width*10) / 10 != width) {
13375 PyErr_SetString(PyExc_ValueError,
13376 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013377 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013378 }
13379 width = width*10 + (c - '0');
13380 }
13381 }
13382 if (c == '.') {
13383 prec = 0;
13384 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013385 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013386 if (c == '*') {
13387 v = getnextarg(args, arglen, &argidx);
13388 if (v == NULL)
13389 goto onError;
13390 if (!PyLong_Check(v)) {
13391 PyErr_SetString(PyExc_TypeError,
13392 "* wants int");
13393 goto onError;
13394 }
13395 prec = PyLong_AsLong(v);
13396 if (prec == -1 && PyErr_Occurred())
13397 goto onError;
13398 if (prec < 0)
13399 prec = 0;
13400 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013401 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013402 }
13403 else if (c >= '0' && c <= '9') {
13404 prec = c - '0';
13405 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013406 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013407 if (c < '0' || c > '9')
13408 break;
13409 if ((prec*10) / 10 != prec) {
13410 PyErr_SetString(PyExc_ValueError,
13411 "prec too big");
13412 goto onError;
13413 }
13414 prec = prec*10 + (c - '0');
13415 }
13416 }
13417 } /* prec */
13418 if (fmtcnt >= 0) {
13419 if (c == 'h' || c == 'l' || c == 'L') {
13420 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013421 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013422 }
13423 }
13424 if (fmtcnt < 0) {
13425 PyErr_SetString(PyExc_ValueError,
13426 "incomplete format");
13427 goto onError;
13428 }
13429 if (c != '%') {
13430 v = getnextarg(args, arglen, &argidx);
13431 if (v == NULL)
13432 goto onError;
13433 }
13434 sign = 0;
13435 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013436 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013437 switch (c) {
13438
13439 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013440 _PyAccu_Accumulate(&acc, percent);
13441 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013442
13443 case 's':
13444 case 'r':
13445 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013446 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013447 temp = v;
13448 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013449 }
13450 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013451 if (c == 's')
13452 temp = PyObject_Str(v);
13453 else if (c == 'r')
13454 temp = PyObject_Repr(v);
13455 else
13456 temp = PyObject_ASCII(v);
13457 if (temp == NULL)
13458 goto onError;
13459 if (PyUnicode_Check(temp))
13460 /* nothing to do */;
13461 else {
13462 Py_DECREF(temp);
13463 PyErr_SetString(PyExc_TypeError,
13464 "%s argument has non-string str()");
13465 goto onError;
13466 }
13467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013468 if (PyUnicode_READY(temp) == -1) {
13469 Py_CLEAR(temp);
13470 goto onError;
13471 }
13472 pbuf = PyUnicode_DATA(temp);
13473 kind = PyUnicode_KIND(temp);
13474 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013475 if (prec >= 0 && len > prec)
13476 len = prec;
13477 break;
13478
13479 case 'i':
13480 case 'd':
13481 case 'u':
13482 case 'o':
13483 case 'x':
13484 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013485 isnumok = 0;
13486 if (PyNumber_Check(v)) {
13487 PyObject *iobj=NULL;
13488
13489 if (PyLong_Check(v)) {
13490 iobj = v;
13491 Py_INCREF(iobj);
13492 }
13493 else {
13494 iobj = PyNumber_Long(v);
13495 }
13496 if (iobj!=NULL) {
13497 if (PyLong_Check(iobj)) {
13498 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013499 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013500 Py_DECREF(iobj);
13501 if (!temp)
13502 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013503 if (PyUnicode_READY(temp) == -1) {
13504 Py_CLEAR(temp);
13505 goto onError;
13506 }
13507 pbuf = PyUnicode_DATA(temp);
13508 kind = PyUnicode_KIND(temp);
13509 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013510 sign = 1;
13511 }
13512 else {
13513 Py_DECREF(iobj);
13514 }
13515 }
13516 }
13517 if (!isnumok) {
13518 PyErr_Format(PyExc_TypeError,
13519 "%%%c format: a number is required, "
13520 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13521 goto onError;
13522 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013523 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013524 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013525 fillobj = zero;
13526 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013527 break;
13528
13529 case 'e':
13530 case 'E':
13531 case 'f':
13532 case 'F':
13533 case 'g':
13534 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013535 temp = formatfloat(v, flags, prec, c);
13536 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013537 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013538 if (PyUnicode_READY(temp) == -1) {
13539 Py_CLEAR(temp);
13540 goto onError;
13541 }
13542 pbuf = PyUnicode_DATA(temp);
13543 kind = PyUnicode_KIND(temp);
13544 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013545 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013546 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013547 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013548 fillobj = zero;
13549 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013550 break;
13551
13552 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013553 {
13554 Py_UCS4 ch = formatchar(v);
13555 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013556 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013557 temp = _PyUnicode_FromUCS4(&ch, 1);
13558 if (temp == NULL)
13559 goto onError;
13560 pbuf = PyUnicode_DATA(temp);
13561 kind = PyUnicode_KIND(temp);
13562 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013563 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013564 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013565
13566 default:
13567 PyErr_Format(PyExc_ValueError,
13568 "unsupported format character '%c' (0x%x) "
13569 "at index %zd",
13570 (31<=c && c<=126) ? (char)c : '?',
13571 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013572 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013573 goto onError;
13574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013575 /* pbuf is initialized here. */
13576 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013577 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013578 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13579 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013580 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013581 pindex++;
13582 }
13583 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13584 signobj = plus;
13585 len--;
13586 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013587 }
13588 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013589 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013590 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013591 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013592 else
13593 sign = 0;
13594 }
13595 if (width < len)
13596 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013597 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013598 if (fill != ' ') {
13599 assert(signobj != NULL);
13600 if (_PyAccu_Accumulate(&acc, signobj))
13601 goto onError;
13602 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013603 if (width > len)
13604 width--;
13605 }
13606 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013607 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013608 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013609 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013610 second = get_latin1_char(
13611 PyUnicode_READ(kind, pbuf, pindex + 1));
13612 pindex += 2;
13613 if (second == NULL ||
13614 _PyAccu_Accumulate(&acc, zero) ||
13615 _PyAccu_Accumulate(&acc, second))
13616 goto onError;
13617 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013618 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013619 width -= 2;
13620 if (width < 0)
13621 width = 0;
13622 len -= 2;
13623 }
13624 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013625 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013626 if (repeat_accumulate(&acc, fillobj, width - len))
13627 goto onError;
13628 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013629 }
13630 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013631 if (sign) {
13632 assert(signobj != NULL);
13633 if (_PyAccu_Accumulate(&acc, signobj))
13634 goto onError;
13635 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013636 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013637 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13638 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013639 second = get_latin1_char(
13640 PyUnicode_READ(kind, pbuf, pindex + 1));
13641 pindex += 2;
13642 if (second == NULL ||
13643 _PyAccu_Accumulate(&acc, zero) ||
13644 _PyAccu_Accumulate(&acc, second))
13645 goto onError;
13646 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013647 }
13648 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013649 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013650 if (temp != NULL) {
13651 assert(pbuf == PyUnicode_DATA(temp));
13652 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013653 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013654 else {
13655 const char *p = (const char *) pbuf;
13656 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013657 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013658 v = PyUnicode_FromKindAndData(kind, p, len);
13659 }
13660 if (v == NULL)
13661 goto onError;
13662 r = _PyAccu_Accumulate(&acc, v);
13663 Py_DECREF(v);
13664 if (r)
13665 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013666 if (width > len && repeat_accumulate(&acc, blank, width - len))
13667 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013668 if (dict && (argidx < arglen) && c != '%') {
13669 PyErr_SetString(PyExc_TypeError,
13670 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013671 goto onError;
13672 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013673 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013674 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013675 } /* until end */
13676 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013677 PyErr_SetString(PyExc_TypeError,
13678 "not all arguments converted during string formatting");
13679 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013680 }
13681
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013682 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013683 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013684 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013685 }
13686 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013687 Py_XDECREF(temp);
13688 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013689 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013690
Benjamin Peterson29060642009-01-31 22:14:21 +000013691 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013692 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013693 Py_XDECREF(temp);
13694 Py_XDECREF(second);
13695 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013696 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013697 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013698 }
13699 return NULL;
13700}
13701
Jeremy Hylton938ace62002-07-17 16:30:39 +000013702static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013703unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13704
Tim Peters6d6c1a32001-08-02 04:15:00 +000013705static PyObject *
13706unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13707{
Benjamin Peterson29060642009-01-31 22:14:21 +000013708 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013709 static char *kwlist[] = {"object", "encoding", "errors", 0};
13710 char *encoding = NULL;
13711 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013712
Benjamin Peterson14339b62009-01-31 16:36:08 +000013713 if (type != &PyUnicode_Type)
13714 return unicode_subtype_new(type, args, kwds);
13715 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013716 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013717 return NULL;
13718 if (x == NULL)
Victor Stinner7931d9a2011-11-04 00:22:48 +010013719 return PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013720 if (encoding == NULL && errors == NULL)
13721 return PyObject_Str(x);
13722 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013723 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013724}
13725
Guido van Rossume023fe02001-08-30 03:12:59 +000013726static PyObject *
13727unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13728{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013729 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013730 Py_ssize_t length, char_size;
13731 int share_wstr, share_utf8;
13732 unsigned int kind;
13733 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013734
Benjamin Peterson14339b62009-01-31 16:36:08 +000013735 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013736
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013737 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013738 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013739 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013740 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013741 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013742 return NULL;
13743
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013744 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013745 if (self == NULL) {
13746 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013747 return NULL;
13748 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013749 kind = PyUnicode_KIND(unicode);
13750 length = PyUnicode_GET_LENGTH(unicode);
13751
13752 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013753#ifdef Py_DEBUG
13754 _PyUnicode_HASH(self) = -1;
13755#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013756 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013757#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013758 _PyUnicode_STATE(self).interned = 0;
13759 _PyUnicode_STATE(self).kind = kind;
13760 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013761 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013762 _PyUnicode_STATE(self).ready = 1;
13763 _PyUnicode_WSTR(self) = NULL;
13764 _PyUnicode_UTF8_LENGTH(self) = 0;
13765 _PyUnicode_UTF8(self) = NULL;
13766 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013767 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013768
13769 share_utf8 = 0;
13770 share_wstr = 0;
13771 if (kind == PyUnicode_1BYTE_KIND) {
13772 char_size = 1;
13773 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13774 share_utf8 = 1;
13775 }
13776 else if (kind == PyUnicode_2BYTE_KIND) {
13777 char_size = 2;
13778 if (sizeof(wchar_t) == 2)
13779 share_wstr = 1;
13780 }
13781 else {
13782 assert(kind == PyUnicode_4BYTE_KIND);
13783 char_size = 4;
13784 if (sizeof(wchar_t) == 4)
13785 share_wstr = 1;
13786 }
13787
13788 /* Ensure we won't overflow the length. */
13789 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13790 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013791 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013792 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013793 data = PyObject_MALLOC((length + 1) * char_size);
13794 if (data == NULL) {
13795 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013796 goto onError;
13797 }
13798
Victor Stinnerc3c74152011-10-02 20:39:55 +020013799 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013800 if (share_utf8) {
13801 _PyUnicode_UTF8_LENGTH(self) = length;
13802 _PyUnicode_UTF8(self) = data;
13803 }
13804 if (share_wstr) {
13805 _PyUnicode_WSTR_LENGTH(self) = length;
13806 _PyUnicode_WSTR(self) = (wchar_t *)data;
13807 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013808
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013809 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013810 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013811 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013812#ifdef Py_DEBUG
13813 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13814#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013815 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013816 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013817
13818onError:
13819 Py_DECREF(unicode);
13820 Py_DECREF(self);
13821 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013822}
13823
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013824PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013825 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013826\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013827Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013828encoding defaults to the current default string encoding.\n\
13829errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013830
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013831static PyObject *unicode_iter(PyObject *seq);
13832
Guido van Rossumd57fd912000-03-10 22:53:23 +000013833PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013834 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013835 "str", /* tp_name */
13836 sizeof(PyUnicodeObject), /* tp_size */
13837 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013838 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013839 (destructor)unicode_dealloc, /* tp_dealloc */
13840 0, /* tp_print */
13841 0, /* tp_getattr */
13842 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013843 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013844 unicode_repr, /* tp_repr */
13845 &unicode_as_number, /* tp_as_number */
13846 &unicode_as_sequence, /* tp_as_sequence */
13847 &unicode_as_mapping, /* tp_as_mapping */
13848 (hashfunc) unicode_hash, /* tp_hash*/
13849 0, /* tp_call*/
13850 (reprfunc) unicode_str, /* tp_str */
13851 PyObject_GenericGetAttr, /* tp_getattro */
13852 0, /* tp_setattro */
13853 0, /* tp_as_buffer */
13854 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013855 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013856 unicode_doc, /* tp_doc */
13857 0, /* tp_traverse */
13858 0, /* tp_clear */
13859 PyUnicode_RichCompare, /* tp_richcompare */
13860 0, /* tp_weaklistoffset */
13861 unicode_iter, /* tp_iter */
13862 0, /* tp_iternext */
13863 unicode_methods, /* tp_methods */
13864 0, /* tp_members */
13865 0, /* tp_getset */
13866 &PyBaseObject_Type, /* tp_base */
13867 0, /* tp_dict */
13868 0, /* tp_descr_get */
13869 0, /* tp_descr_set */
13870 0, /* tp_dictoffset */
13871 0, /* tp_init */
13872 0, /* tp_alloc */
13873 unicode_new, /* tp_new */
13874 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013875};
13876
13877/* Initialize the Unicode implementation */
13878
Victor Stinner3a50e702011-10-18 21:21:00 +020013879int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013880{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013881 int i;
13882
Thomas Wouters477c8d52006-05-27 19:21:47 +000013883 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013884 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013885 0x000A, /* LINE FEED */
13886 0x000D, /* CARRIAGE RETURN */
13887 0x001C, /* FILE SEPARATOR */
13888 0x001D, /* GROUP SEPARATOR */
13889 0x001E, /* RECORD SEPARATOR */
13890 0x0085, /* NEXT LINE */
13891 0x2028, /* LINE SEPARATOR */
13892 0x2029, /* PARAGRAPH SEPARATOR */
13893 };
13894
Fred Drakee4315f52000-05-09 19:53:39 +000013895 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013896 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013897 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013898 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013899 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013900
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013901 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013902 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013903 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013904 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013905
13906 /* initialize the linebreak bloom filter */
13907 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013908 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013909 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013910
13911 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013912
13913#ifdef HAVE_MBCS
13914 winver.dwOSVersionInfoSize = sizeof(winver);
13915 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13916 PyErr_SetFromWindowsErr(0);
13917 return -1;
13918 }
13919#endif
13920 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013921}
13922
13923/* Finalize the Unicode implementation */
13924
/* This implementation has nothing to do here: it unconditionally
   reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
13930
Guido van Rossumd57fd912000-03-10 22:53:23 +000013931void
Thomas Wouters78890102000-07-22 19:25:51 +000013932_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013933{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013934 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013935
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013936 Py_XDECREF(unicode_empty);
13937 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013938
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013939 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013940 if (unicode_latin1[i]) {
13941 Py_DECREF(unicode_latin1[i]);
13942 unicode_latin1[i] = NULL;
13943 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013944 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013945 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013946 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013947}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013948
Walter Dörwald16807132007-05-25 13:52:07 +000013949void
13950PyUnicode_InternInPlace(PyObject **p)
13951{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013952 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013953 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013954#ifdef Py_DEBUG
13955 assert(s != NULL);
13956 assert(_PyUnicode_CHECK(s));
13957#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013958 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013959 return;
13960#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013961 /* If it's a subclass, we don't really know what putting
13962 it in the interned dict might do. */
13963 if (!PyUnicode_CheckExact(s))
13964 return;
13965 if (PyUnicode_CHECK_INTERNED(s))
13966 return;
13967 if (interned == NULL) {
13968 interned = PyDict_New();
13969 if (interned == NULL) {
13970 PyErr_Clear(); /* Don't leave an exception */
13971 return;
13972 }
13973 }
13974 /* It might be that the GetItem call fails even
13975 though the key is present in the dictionary,
13976 namely when this happens during a stack overflow. */
13977 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013978 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013979 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013980
Benjamin Peterson29060642009-01-31 22:14:21 +000013981 if (t) {
13982 Py_INCREF(t);
13983 Py_DECREF(*p);
13984 *p = t;
13985 return;
13986 }
Walter Dörwald16807132007-05-25 13:52:07 +000013987
Benjamin Peterson14339b62009-01-31 16:36:08 +000013988 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013989 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013990 PyErr_Clear();
13991 PyThreadState_GET()->recursion_critical = 0;
13992 return;
13993 }
13994 PyThreadState_GET()->recursion_critical = 0;
13995 /* The two references in interned are not counted by refcnt.
13996 The deallocator will take care of this */
13997 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013998 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013999}
14000
14001void
14002PyUnicode_InternImmortal(PyObject **p)
14003{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014004 PyUnicode_InternInPlace(p);
14005 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014006 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014007 Py_INCREF(*p);
14008 }
Walter Dörwald16807132007-05-25 13:52:07 +000014009}
14010
14011PyObject *
14012PyUnicode_InternFromString(const char *cp)
14013{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014014 PyObject *s = PyUnicode_FromString(cp);
14015 if (s == NULL)
14016 return NULL;
14017 PyUnicode_InternInPlace(&s);
14018 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014019}
14020
Alexander Belopolsky40018472011-02-26 01:02:56 +000014021void
14022_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014023{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014024 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014025 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014026 Py_ssize_t i, n;
14027 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014028
Benjamin Peterson14339b62009-01-31 16:36:08 +000014029 if (interned == NULL || !PyDict_Check(interned))
14030 return;
14031 keys = PyDict_Keys(interned);
14032 if (keys == NULL || !PyList_Check(keys)) {
14033 PyErr_Clear();
14034 return;
14035 }
Walter Dörwald16807132007-05-25 13:52:07 +000014036
Benjamin Peterson14339b62009-01-31 16:36:08 +000014037 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14038 detector, interned unicode strings are not forcibly deallocated;
14039 rather, we give them their stolen references back, and then clear
14040 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014041
Benjamin Peterson14339b62009-01-31 16:36:08 +000014042 n = PyList_GET_SIZE(keys);
14043 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014044 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014045 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014046 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014047 if (PyUnicode_READY(s) == -1) {
14048 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014049 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014051 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014052 case SSTATE_NOT_INTERNED:
14053 /* XXX Shouldn't happen */
14054 break;
14055 case SSTATE_INTERNED_IMMORTAL:
14056 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014057 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014058 break;
14059 case SSTATE_INTERNED_MORTAL:
14060 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014061 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014062 break;
14063 default:
14064 Py_FatalError("Inconsistent interned string state.");
14065 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014066 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014067 }
14068 fprintf(stderr, "total size of all interned strings: "
14069 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14070 "mortal/immortal\n", mortal_size, immortal_size);
14071 Py_DECREF(keys);
14072 PyDict_Clear(interned);
14073 Py_DECREF(interned);
14074 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014075}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014076
14077
14078/********************* Unicode Iterator **************************/
14079
14080typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014081 PyObject_HEAD
14082 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014083 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014084} unicodeiterobject;
14085
14086static void
14087unicodeiter_dealloc(unicodeiterobject *it)
14088{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014089 _PyObject_GC_UNTRACK(it);
14090 Py_XDECREF(it->it_seq);
14091 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014092}
14093
14094static int
14095unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14096{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014097 Py_VISIT(it->it_seq);
14098 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014099}
14100
14101static PyObject *
14102unicodeiter_next(unicodeiterobject *it)
14103{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014104 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014105
Benjamin Peterson14339b62009-01-31 16:36:08 +000014106 assert(it != NULL);
14107 seq = it->it_seq;
14108 if (seq == NULL)
14109 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014110 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014112 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14113 int kind = PyUnicode_KIND(seq);
14114 void *data = PyUnicode_DATA(seq);
14115 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14116 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014117 if (item != NULL)
14118 ++it->it_index;
14119 return item;
14120 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014121
Benjamin Peterson14339b62009-01-31 16:36:08 +000014122 Py_DECREF(seq);
14123 it->it_seq = NULL;
14124 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014125}
14126
14127static PyObject *
14128unicodeiter_len(unicodeiterobject *it)
14129{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014130 Py_ssize_t len = 0;
14131 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014132 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014133 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014134}
14135
14136PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14137
14138static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014139 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014140 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014141 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014142};
14143
14144PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014145 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14146 "str_iterator", /* tp_name */
14147 sizeof(unicodeiterobject), /* tp_basicsize */
14148 0, /* tp_itemsize */
14149 /* methods */
14150 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14151 0, /* tp_print */
14152 0, /* tp_getattr */
14153 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014154 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014155 0, /* tp_repr */
14156 0, /* tp_as_number */
14157 0, /* tp_as_sequence */
14158 0, /* tp_as_mapping */
14159 0, /* tp_hash */
14160 0, /* tp_call */
14161 0, /* tp_str */
14162 PyObject_GenericGetAttr, /* tp_getattro */
14163 0, /* tp_setattro */
14164 0, /* tp_as_buffer */
14165 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14166 0, /* tp_doc */
14167 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14168 0, /* tp_clear */
14169 0, /* tp_richcompare */
14170 0, /* tp_weaklistoffset */
14171 PyObject_SelfIter, /* tp_iter */
14172 (iternextfunc)unicodeiter_next, /* tp_iternext */
14173 unicodeiter_methods, /* tp_methods */
14174 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014175};
14176
14177static PyObject *
14178unicode_iter(PyObject *seq)
14179{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014180 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014181
Benjamin Peterson14339b62009-01-31 16:36:08 +000014182 if (!PyUnicode_Check(seq)) {
14183 PyErr_BadInternalCall();
14184 return NULL;
14185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014186 if (PyUnicode_READY(seq) == -1)
14187 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014188 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14189 if (it == NULL)
14190 return NULL;
14191 it->it_index = 0;
14192 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014193 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014194 _PyObject_GC_TRACK(it);
14195 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014196}
14197
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014198
14199size_t
14200Py_UNICODE_strlen(const Py_UNICODE *u)
14201{
14202 int res = 0;
14203 while(*u++)
14204 res++;
14205 return res;
14206}
14207
14208Py_UNICODE*
14209Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14210{
14211 Py_UNICODE *u = s1;
14212 while ((*u++ = *s2++));
14213 return s1;
14214}
14215
14216Py_UNICODE*
14217Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14218{
14219 Py_UNICODE *u = s1;
14220 while ((*u++ = *s2++))
14221 if (n-- == 0)
14222 break;
14223 return s1;
14224}
14225
14226Py_UNICODE*
14227Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14228{
14229 Py_UNICODE *u1 = s1;
14230 u1 += Py_UNICODE_strlen(u1);
14231 Py_UNICODE_strcpy(u1, s2);
14232 return s1;
14233}
14234
14235int
14236Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14237{
14238 while (*s1 && *s2 && *s1 == *s2)
14239 s1++, s2++;
14240 if (*s1 && *s2)
14241 return (*s1 < *s2) ? -1 : +1;
14242 if (*s1)
14243 return 1;
14244 if (*s2)
14245 return -1;
14246 return 0;
14247}
14248
14249int
14250Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14251{
14252 register Py_UNICODE u1, u2;
14253 for (; n != 0; n--) {
14254 u1 = *s1;
14255 u2 = *s2;
14256 if (u1 != u2)
14257 return (u1 < u2) ? -1 : +1;
14258 if (u1 == '\0')
14259 return 0;
14260 s1++;
14261 s2++;
14262 }
14263 return 0;
14264}
14265
14266Py_UNICODE*
14267Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14268{
14269 const Py_UNICODE *p;
14270 for (p = s; *p; p++)
14271 if (*p == c)
14272 return (Py_UNICODE*)p;
14273 return NULL;
14274}
14275
14276Py_UNICODE*
14277Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14278{
14279 const Py_UNICODE *p;
14280 p = s + Py_UNICODE_strlen(s);
14281 while (p != s) {
14282 p--;
14283 if (*p == c)
14284 return (Py_UNICODE*)p;
14285 }
14286 return NULL;
14287}
Victor Stinner331ea922010-08-10 16:37:20 +000014288
Victor Stinner71133ff2010-09-01 23:43:53 +000014289Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014290PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014291{
Victor Stinner577db2c2011-10-11 22:12:48 +020014292 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014293 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014295 if (!PyUnicode_Check(unicode)) {
14296 PyErr_BadArgument();
14297 return NULL;
14298 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014299 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014300 if (u == NULL)
14301 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014302 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014303 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014304 PyErr_NoMemory();
14305 return NULL;
14306 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014307 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014308 size *= sizeof(Py_UNICODE);
14309 copy = PyMem_Malloc(size);
14310 if (copy == NULL) {
14311 PyErr_NoMemory();
14312 return NULL;
14313 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014314 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014315 return copy;
14316}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014317
Georg Brandl66c221e2010-10-14 07:04:07 +000014318/* A _string module, to export formatter_parser and formatter_field_name_split
14319 to the string.Formatter class implemented in Python. */
14320
14321static PyMethodDef _string_methods[] = {
14322 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14323 METH_O, PyDoc_STR("split the argument as a field name")},
14324 {"formatter_parser", (PyCFunction) formatter_parser,
14325 METH_O, PyDoc_STR("parse the argument as a format string")},
14326 {NULL, NULL}
14327};
14328
14329static struct PyModuleDef _string_module = {
14330 PyModuleDef_HEAD_INIT,
14331 "_string",
14332 PyDoc_STR("string helper module"),
14333 0,
14334 _string_methods,
14335 NULL,
14336 NULL,
14337 NULL,
14338 NULL
14339};
14340
14341PyMODINIT_FUNC
14342PyInit__string(void)
14343{
14344 return PyModule_Create(&_string_module);
14345}
14346
14347
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014348#ifdef __cplusplus
14349}
14350#endif