blob: 77bc1a9adee6bcbb48eb199d41879b7611f62463 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Endianness switches; defaults to little endian */
50
51#ifdef WORDS_BIGENDIAN
52# define BYTEORDER_IS_BIG_ENDIAN
53#else
54# define BYTEORDER_IS_LITTLE_ENDIAN
55#endif
56
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000057/* --- Globals ------------------------------------------------------------
58
59 The globals are initialized by the _PyUnicode_Init() API and should
60 not be used before calling that API.
61
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Sanity predicate meant to be used inside assert(): a plain type check
   in regular builds, a structural consistency check (string content is
   not scanned) when Py_DEBUG is defined. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020074
/* utf8 member of the PyCompactUnicodeObject part (no checks, lvalue) */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string: for compact ASCII strings this is the
   memory located right after the PyASCIIObject header, otherwise the
   utf8 member */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* utf8_length member of the PyCompactUnicodeObject part (no checks, lvalue) */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: for compact ASCII strings this is the
   length member, otherwise the utf8_length member */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* wstr member of the PyASCIIObject part (no checks, lvalue) */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* wstr_length member of the PyCompactUnicodeObject part (no checks, lvalue) */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* length member of the PyASCIIObject part (no checks, lvalue) */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* state bit-field struct of the PyASCIIObject part (no checks, lvalue) */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* hash member of the PyASCIIObject part (no checks, lvalue) */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind, after asserting _PyUnicode_CHECK(op) */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* length member, after asserting _PyUnicode_CHECK(op) */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* data.any member of the PyUnicodeObject part (no checks, lvalue) */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200109
/* File-local override of PyUnicode_READY: asserts _PyUnicode_CHECK(op),
   then evaluates to 0 when the string is already ready and to the result
   of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ?                         \
          _PyUnicode_Ready(op) :                        \
          0))
Victor Stinner910337b2011-10-03 03:20:16 +0200116
/* true if the utf8 pointer aliases the canonical character data;
   must not be used on compact ASCII strings (asserted) */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (PyUnicode_DATA(op) == _PyUnicode_UTF8(op)))
/* true if the wstr pointer aliases the canonical character data.
   The macro only refers to its own argument, so it can be applied to any
   expression and not just to a variable that happens to be called
   "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
124
/* true if the Unicode object owns a separately allocated UTF-8 buffer:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not shared with the character data */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op) != NULL                    \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
132
/* true if the Unicode object owns a separately allocated wstr buffer:
   its wstr pointer is set and either the string is not ready yet or the
   pointer is not shared with the character data.  The readiness test
   comes first so that PyUnicode_DATA() is only evaluated on ready
   strings. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) != NULL &&                    \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
140
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   begin, end and to are each evaluated exactly once.  "to" is fully
   parenthesized before the cast, so an expression such as "buf + offset"
   is converted as a whole instead of having the cast bind to "buf" only.
   The main loop copies four characters per iteration; the tail loop
   handles the remaining 0..3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
/* The Unicode string has been modified: invalidate the cached hash by
   storing -1 in the hash field */
#define _PyUnicode_DIRTY(op)                            \
    do {                                                \
        _PyUnicode_HASH(op) = -1;                       \
    } while (0)
167
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
176static PyObject *interned;
177
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000178/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200179static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200181/* List of static strings. */
182static _Py_Identifier *static_strings;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* Single character Unicode strings in the Latin-1 range are being
185 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200186static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187
/* Fast detection of the most frequent whitespace characters: one flag
   per ASCII code point (indices 0x00..0x7F), 1 for whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
218
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200220static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200221static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200222static void copy_characters(
223 PyObject *to, Py_ssize_t to_start,
224 PyObject *from, Py_ssize_t from_start,
225 Py_ssize_t how_many);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226
Alexander Belopolsky40018472011-02-26 01:02:56 +0000227static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200228unicode_fromascii(const unsigned char *s, Py_ssize_t size);
229static PyObject *
230_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
231static PyObject *
232_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
235
236static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000237unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000238 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100239 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
241
Alexander Belopolsky40018472011-02-26 01:02:56 +0000242static void
243raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300244 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100245 PyObject *unicode,
246 Py_ssize_t startpos, Py_ssize_t endpos,
247 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000248
/* Same for linebreaks: one flag per ASCII code point (indices
   0x00..0x7F), 1 for line-breaking characters.  The table is only ever
   read, so it is const-qualified. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line breaks */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
276
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300277/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
278 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000279Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000280PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000282#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000283 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 /* This is actually an illegal character, so it should
286 not be passed to unichr. */
287 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000288#endif
289}
290
#ifdef Py_DEBUG
/* Debug-build validator for the internal layout of a Unicode object.

   The structural invariants of the four layouts (compact ASCII, compact,
   legacy not-ready "wchar", legacy ready) are checked with assert().
   When check_content is non-zero and the string is not in the wchar
   state, the characters are scanned as well to verify that the narrowest
   possible kind is in use.

   Always returns 1 (a violated invariant aborts through assert), so the
   call itself can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;

    assert(PyUnicode_Check(op));
    kind = ascii->state.kind;

    if (ascii->state.ascii != 1 || ascii->state.compact != 1) {
        /* every layout except compact ASCII is inspected through the
           PyCompactUnicodeObject fields */
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact != 1) {
            /* legacy object: characters live in a separate buffer */
            data = ((PyUnicodeObject *)op)->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* legacy string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (!ascii->state.ascii) {
                    assert (compact->utf8 != data);
                }
                else {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
            }
            else {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }
        else {
            /* compact non-ASCII: characters follow the struct */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the kind has the same
               width as wchar_t */
#if SIZEOF_WCHAR_T == 2
            if (kind == PyUnicode_2BYTE_KIND)
#else
            if (kind == PyUnicode_4BYTE_KIND)
#endif
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* an absent cache must have a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        Py_ssize_t i = 0;

        while (i < ascii->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
            i++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            /* FIXME: Issue #13441: on Solaris, localeconv() and strxfrm()
               return characters outside the range U+0000-U+10FFFF. */
            /* assert(maxchar <= 0x10FFFF); */
            break;
        }
    }
    return 1;
}
#endif
404
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100405static PyObject*
406unicode_result_wchar(PyObject *unicode)
407{
408#ifndef Py_DEBUG
409 Py_ssize_t len;
410
411 assert(Py_REFCNT(unicode) == 1);
412
413 len = _PyUnicode_WSTR_LENGTH(unicode);
414 if (len == 0) {
415 Py_INCREF(unicode_empty);
416 Py_DECREF(unicode);
417 return unicode_empty;
418 }
419
420 if (len == 1) {
421 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
422 if (ch < 256) {
423 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
424 Py_DECREF(unicode);
425 return latin1_char;
426 }
427 }
428
429 if (_PyUnicode_Ready(unicode) < 0) {
430 Py_XDECREF(unicode);
431 return NULL;
432 }
433#else
434 /* don't make the result ready in debug mode to ensure that the caller
435 makes the string ready before using it */
436 assert(_PyUnicode_CheckConsistency(unicode, 1));
437#endif
438 return unicode;
439}
440
441static PyObject*
442unicode_result_ready(PyObject *unicode)
443{
444 Py_ssize_t length;
445
446 length = PyUnicode_GET_LENGTH(unicode);
447 if (length == 0) {
448 if (unicode != unicode_empty) {
449 Py_INCREF(unicode_empty);
450 Py_DECREF(unicode);
451 }
452 return unicode_empty;
453 }
454
455 if (length == 1) {
456 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
457 if (ch < 256) {
458 PyObject *latin1_char = unicode_latin1[ch];
459 if (latin1_char != NULL) {
460 if (unicode != latin1_char) {
461 Py_INCREF(latin1_char);
462 Py_DECREF(unicode);
463 }
464 return latin1_char;
465 }
466 else {
467 assert(_PyUnicode_CheckConsistency(unicode, 1));
468 Py_INCREF(unicode);
469 unicode_latin1[ch] = unicode;
470 return unicode;
471 }
472 }
473 }
474
475 assert(_PyUnicode_CheckConsistency(unicode, 1));
476 return unicode;
477}
478
479static PyObject*
480unicode_result(PyObject *unicode)
481{
482 assert(_PyUnicode_CHECK(unicode));
483 if (PyUnicode_IS_READY(unicode))
484 return unicode_result_ready(unicode);
485 else
486 return unicode_result_wchar(unicode);
487}
488
Victor Stinner3a50e702011-10-18 21:21:00 +0200489#ifdef HAVE_MBCS
490static OSVERSIONINFOEX winver;
491#endif
492
Thomas Wouters477c8d52006-05-27 19:21:47 +0000493/* --- Bloom Filters ----------------------------------------------------- */
494
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7, depending on the width of long)
   of each Unicode character as the bit index. */
498
499/* the linebreak mask is set up by Unicode_Init below */
500
Antoine Pitrouf068f942010-01-13 14:19:12 +0000501#if LONG_BIT >= 128
502#define BLOOM_WIDTH 128
503#elif LONG_BIT >= 64
504#define BLOOM_WIDTH 64
505#elif LONG_BIT >= 32
506#define BLOOM_WIDTH 32
507#else
508#error "LONG_BIT is smaller than 32"
509#endif
510
Thomas Wouters477c8d52006-05-27 19:21:47 +0000511#define BLOOM_MASK unsigned long
512
513static BLOOM_MASK bloom_linebreak;
514
/* BLOOM_ADD sets, and BLOOM tests, the bit of "mask" selected by the low
   bits of "ch" (ch modulo BLOOM_WIDTH).  BLOOM evaluates to non-zero
   when the bit is set.  "mask" is parenthesized so that compound
   expressions such as "a | b" are tested as a whole; for BLOOM_ADD it
   must be a modifiable lvalue. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000517
/* Line break test: ASCII characters are answered from the
   ascii_linebreak table; for anything else the bloom mask is consulted
   first and Py_UNICODE_ISLINEBREAK() only runs on a filter hit. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) >= 128U ?                                                     \
        (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)) :  \
        ascii_linebreak[(ch)])
Thomas Wouters477c8d52006-05-27 19:21:47 +0000521
Alexander Belopolsky40018472011-02-26 01:02:56 +0000522Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200523make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000524{
525 /* calculate simple bloom-style bitmask for a given unicode string */
526
Antoine Pitrouf068f942010-01-13 14:19:12 +0000527 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000528 Py_ssize_t i;
529
530 mask = 0;
531 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200532 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000533
534 return mask;
535}
536
/* Membership test for "chr" in the string "str" whose bloom mask is
   "mask": the cheap bit test runs first and PyUnicode_FindChar() is only
   called on a filter hit. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0,                                \
                            PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000540
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200541/* Compilation of templated routines */
542
543#include "stringlib/asciilib.h"
544#include "stringlib/fastsearch.h"
545#include "stringlib/partition.h"
546#include "stringlib/split.h"
547#include "stringlib/count.h"
548#include "stringlib/find.h"
549#include "stringlib/find_max_char.h"
550#include "stringlib/localeutil.h"
551#include "stringlib/undef.h"
552
553#include "stringlib/ucs1lib.h"
554#include "stringlib/fastsearch.h"
555#include "stringlib/partition.h"
556#include "stringlib/split.h"
557#include "stringlib/count.h"
558#include "stringlib/find.h"
559#include "stringlib/find_max_char.h"
560#include "stringlib/localeutil.h"
561#include "stringlib/undef.h"
562
563#include "stringlib/ucs2lib.h"
564#include "stringlib/fastsearch.h"
565#include "stringlib/partition.h"
566#include "stringlib/split.h"
567#include "stringlib/count.h"
568#include "stringlib/find.h"
569#include "stringlib/find_max_char.h"
570#include "stringlib/localeutil.h"
571#include "stringlib/undef.h"
572
573#include "stringlib/ucs4lib.h"
574#include "stringlib/fastsearch.h"
575#include "stringlib/partition.h"
576#include "stringlib/split.h"
577#include "stringlib/count.h"
578#include "stringlib/find.h"
579#include "stringlib/find_max_char.h"
580#include "stringlib/localeutil.h"
581#include "stringlib/undef.h"
582
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200583#include "stringlib/unicodedefs.h"
584#include "stringlib/fastsearch.h"
585#include "stringlib/count.h"
586#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100587#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200588
Guido van Rossumd57fd912000-03-10 22:53:23 +0000589/* --- Unicode Object ----------------------------------------------------- */
590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200591static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200592fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200593
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200594Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
595 Py_ssize_t size, Py_UCS4 ch,
596 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200597{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200598 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
599
600 switch (kind) {
601 case PyUnicode_1BYTE_KIND:
602 {
603 Py_UCS1 ch1 = (Py_UCS1) ch;
604 if (ch1 == ch)
605 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
606 else
607 return -1;
608 }
609 case PyUnicode_2BYTE_KIND:
610 {
611 Py_UCS2 ch2 = (Py_UCS2) ch;
612 if (ch2 == ch)
613 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
614 else
615 return -1;
616 }
617 case PyUnicode_4BYTE_KIND:
618 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
619 default:
620 assert(0);
621 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200622 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200623}
624
Victor Stinnerfe226c02011-10-03 03:52:20 +0200625static PyObject*
626resize_compact(PyObject *unicode, Py_ssize_t length)
627{
628 Py_ssize_t char_size;
629 Py_ssize_t struct_size;
630 Py_ssize_t new_size;
631 int share_wstr;
632
633 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200634 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200635 if (PyUnicode_IS_COMPACT_ASCII(unicode))
636 struct_size = sizeof(PyASCIIObject);
637 else
638 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200639 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200640
641 _Py_DEC_REFTOTAL;
642 _Py_ForgetReference(unicode);
643
644 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
645 PyErr_NoMemory();
646 return NULL;
647 }
648 new_size = (struct_size + (length + 1) * char_size);
649
650 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
651 if (unicode == NULL) {
652 PyObject_Del(unicode);
653 PyErr_NoMemory();
654 return NULL;
655 }
656 _Py_NewReference(unicode);
657 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200658 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200659 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200660 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
661 _PyUnicode_WSTR_LENGTH(unicode) = length;
662 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200663 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
664 length, 0);
665 return unicode;
666}
667
Alexander Belopolsky40018472011-02-26 01:02:56 +0000668static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200669resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000670{
Victor Stinner95663112011-10-04 01:03:50 +0200671 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200672 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200673 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000674
Victor Stinner95663112011-10-04 01:03:50 +0200675 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676
677 if (PyUnicode_IS_READY(unicode)) {
678 Py_ssize_t char_size;
679 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200680 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681 void *data;
682
683 data = _PyUnicode_DATA_ANY(unicode);
684 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200685 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200686 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
687 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200688 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
689 {
690 PyObject_DEL(_PyUnicode_UTF8(unicode));
691 _PyUnicode_UTF8(unicode) = NULL;
692 _PyUnicode_UTF8_LENGTH(unicode) = 0;
693 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200694
695 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
696 PyErr_NoMemory();
697 return -1;
698 }
699 new_size = (length + 1) * char_size;
700
701 data = (PyObject *)PyObject_REALLOC(data, new_size);
702 if (data == NULL) {
703 PyErr_NoMemory();
704 return -1;
705 }
706 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200707 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200708 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200709 _PyUnicode_WSTR_LENGTH(unicode) = length;
710 }
711 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200712 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200713 _PyUnicode_UTF8_LENGTH(unicode) = length;
714 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200715 _PyUnicode_LENGTH(unicode) = length;
716 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200717 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200718 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 }
Victor Stinner95663112011-10-04 01:03:50 +0200722 assert(_PyUnicode_WSTR(unicode) != NULL);
723
724 /* check for integer overflow */
725 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
726 PyErr_NoMemory();
727 return -1;
728 }
729 wstr = _PyUnicode_WSTR(unicode);
730 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
731 if (!wstr) {
732 PyErr_NoMemory();
733 return -1;
734 }
735 _PyUnicode_WSTR(unicode) = wstr;
736 _PyUnicode_WSTR(unicode)[length] = 0;
737 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200738 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000739 return 0;
740}
741
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742static PyObject*
743resize_copy(PyObject *unicode, Py_ssize_t length)
744{
745 Py_ssize_t copy_length;
746 if (PyUnicode_IS_COMPACT(unicode)) {
747 PyObject *copy;
748 assert(PyUnicode_IS_READY(unicode));
749
750 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
751 if (copy == NULL)
752 return NULL;
753
754 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200755 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200756 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200757 }
758 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200759 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760 assert(_PyUnicode_WSTR(unicode) != NULL);
761 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200762 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200763 if (w == NULL)
764 return NULL;
765 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
766 copy_length = Py_MIN(copy_length, length);
767 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
768 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200769 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200770 }
771}
772
/* We allocate one more character (not just one byte) to make sure the
   string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000776
777 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000778 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000779
780*/
781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200782#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200783static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200784#endif
785
Alexander Belopolsky40018472011-02-26 01:02:56 +0000786static PyUnicodeObject *
787_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000788{
789 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200790 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000791
Thomas Wouters477c8d52006-05-27 19:21:47 +0000792 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000793 if (length == 0 && unicode_empty != NULL) {
794 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200795 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000796 }
797
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000798 /* Ensure we won't overflow the size. */
799 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
800 return (PyUnicodeObject *)PyErr_NoMemory();
801 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802 if (length < 0) {
803 PyErr_SetString(PyExc_SystemError,
804 "Negative size passed to _PyUnicode_New");
805 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000806 }
807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200808#ifdef Py_DEBUG
809 ++unicode_old_new_calls;
810#endif
811
812 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
813 if (unicode == NULL)
814 return NULL;
815 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
816 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
817 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000818 PyErr_NoMemory();
819 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200821
Jeremy Hyltond8082792003-09-16 19:41:39 +0000822 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000823 * the caller fails before initializing str -- unicode_resize()
824 * reads str[0], and the Keep-Alive optimization can keep memory
825 * allocated for str alive across a call to unicode_dealloc(unicode).
826 * We don't want unicode_resize to read uninitialized memory in
827 * that case.
828 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200829 _PyUnicode_WSTR(unicode)[0] = 0;
830 _PyUnicode_WSTR(unicode)[length] = 0;
831 _PyUnicode_WSTR_LENGTH(unicode) = length;
832 _PyUnicode_HASH(unicode) = -1;
833 _PyUnicode_STATE(unicode).interned = 0;
834 _PyUnicode_STATE(unicode).kind = 0;
835 _PyUnicode_STATE(unicode).compact = 0;
836 _PyUnicode_STATE(unicode).ready = 0;
837 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200838 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200839 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200840 _PyUnicode_UTF8(unicode) = NULL;
841 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100842 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000843 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000844
Benjamin Peterson29060642009-01-31 22:14:21 +0000845 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000846 /* XXX UNREF/NEWREF interface should be more symmetrical */
847 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000848 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000849 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000850 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000851}
852
Victor Stinnerf42dc442011-10-02 23:33:16 +0200853static const char*
854unicode_kind_name(PyObject *unicode)
855{
Victor Stinner42dfd712011-10-03 14:41:45 +0200856 /* don't check consistency: unicode_kind_name() is called from
857 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200858 if (!PyUnicode_IS_COMPACT(unicode))
859 {
860 if (!PyUnicode_IS_READY(unicode))
861 return "wstr";
862 switch(PyUnicode_KIND(unicode))
863 {
864 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200865 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200866 return "legacy ascii";
867 else
868 return "legacy latin1";
869 case PyUnicode_2BYTE_KIND:
870 return "legacy UCS2";
871 case PyUnicode_4BYTE_KIND:
872 return "legacy UCS4";
873 default:
874 return "<legacy invalid kind>";
875 }
876 }
877 assert(PyUnicode_IS_READY(unicode));
878 switch(PyUnicode_KIND(unicode))
879 {
880 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200881 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200882 return "ascii";
883 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200884 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200885 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200886 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200888 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200889 default:
890 return "<invalid compact kind>";
891 }
892}
893
#ifdef Py_DEBUG
/* Incremented by PyUnicode_New() each time it gets past the empty-string
   shortcut. */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Wrapper around the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Wrapper around the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print layout information about `unicode` to stdout, then return the
   result of the PyUnicode_DATA() macro. */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of the internal representation of `op` to
   stdout: kind name, length, wstr pointer (flagged "shared" when it is the
   data buffer), and, for anything but compact ASCII, the wstr length and
   the UTF-8 cache pointer and length.

   The struct fields are read directly rather than through the accessor
   macros. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    if (ascii->state.compact)
    {
        /* Compact objects store the characters right after the header. */
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;
    /* Lengths are Py_ssize_t (signed): print them with %zd, not %zu. */
    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    /* %p requires a void* argument. */
    printf("wstr=%p", (void *)ascii->wstr);

    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
947
948PyObject *
949PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
950{
951 PyObject *obj;
952 PyCompactUnicodeObject *unicode;
953 void *data;
954 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200955 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200956 Py_ssize_t char_size;
957 Py_ssize_t struct_size;
958
959 /* Optimization for empty strings */
960 if (size == 0 && unicode_empty != NULL) {
961 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200962 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963 }
964
965#ifdef Py_DEBUG
966 ++unicode_new_new_calls;
967#endif
968
Victor Stinner9e9d6892011-10-04 01:02:02 +0200969 is_ascii = 0;
970 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200971 struct_size = sizeof(PyCompactUnicodeObject);
972 if (maxchar < 128) {
973 kind_state = PyUnicode_1BYTE_KIND;
974 char_size = 1;
975 is_ascii = 1;
976 struct_size = sizeof(PyASCIIObject);
977 }
978 else if (maxchar < 256) {
979 kind_state = PyUnicode_1BYTE_KIND;
980 char_size = 1;
981 }
982 else if (maxchar < 65536) {
983 kind_state = PyUnicode_2BYTE_KIND;
984 char_size = 2;
985 if (sizeof(wchar_t) == 2)
986 is_sharing = 1;
987 }
988 else {
989 kind_state = PyUnicode_4BYTE_KIND;
990 char_size = 4;
991 if (sizeof(wchar_t) == 4)
992 is_sharing = 1;
993 }
994
995 /* Ensure we won't overflow the size. */
996 if (size < 0) {
997 PyErr_SetString(PyExc_SystemError,
998 "Negative size passed to PyUnicode_New");
999 return NULL;
1000 }
1001 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1002 return PyErr_NoMemory();
1003
1004 /* Duplicated allocation code from _PyObject_New() instead of a call to
1005 * PyObject_New() so we are able to allocate space for the object and
1006 * it's data buffer.
1007 */
1008 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1009 if (obj == NULL)
1010 return PyErr_NoMemory();
1011 obj = PyObject_INIT(obj, &PyUnicode_Type);
1012 if (obj == NULL)
1013 return NULL;
1014
1015 unicode = (PyCompactUnicodeObject *)obj;
1016 if (is_ascii)
1017 data = ((PyASCIIObject*)obj) + 1;
1018 else
1019 data = unicode + 1;
1020 _PyUnicode_LENGTH(unicode) = size;
1021 _PyUnicode_HASH(unicode) = -1;
1022 _PyUnicode_STATE(unicode).interned = 0;
1023 _PyUnicode_STATE(unicode).kind = kind_state;
1024 _PyUnicode_STATE(unicode).compact = 1;
1025 _PyUnicode_STATE(unicode).ready = 1;
1026 _PyUnicode_STATE(unicode).ascii = is_ascii;
1027 if (is_ascii) {
1028 ((char*)data)[size] = 0;
1029 _PyUnicode_WSTR(unicode) = NULL;
1030 }
1031 else if (kind_state == PyUnicode_1BYTE_KIND) {
1032 ((char*)data)[size] = 0;
1033 _PyUnicode_WSTR(unicode) = NULL;
1034 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001035 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001036 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001037 }
1038 else {
1039 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001040 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001041 if (kind_state == PyUnicode_2BYTE_KIND)
1042 ((Py_UCS2*)data)[size] = 0;
1043 else /* kind_state == PyUnicode_4BYTE_KIND */
1044 ((Py_UCS4*)data)[size] = 0;
1045 if (is_sharing) {
1046 _PyUnicode_WSTR_LENGTH(unicode) = size;
1047 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1048 }
1049 else {
1050 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1051 _PyUnicode_WSTR(unicode) = NULL;
1052 }
1053 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001054 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001055 return obj;
1056}
1057
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing the result
   into the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is combined into one code point; every other
   unit (including a lone surrogate) is copied as-is.

   The asserts require `unicode` to be of 4-byte kind and its length to be
   exactly the number of code points produced.  No terminator is written
   here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    src = begin;
    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (!(src[0] >= 0xD800 && src[0] <= 0xDBFF
              && (src+1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF))
        {
            /* Ordinary unit or unpaired surrogate: copy unchanged. */
            *dst++ = *src++;
            continue;
        }
        /* Surrogate pair: merge both halves into one code point. */
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1096
Victor Stinnercd9950f2011-10-02 00:34:53 +02001097static int
1098_PyUnicode_Dirty(PyObject *unicode)
1099{
Victor Stinner910337b2011-10-03 03:20:16 +02001100 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001101 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001102 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001103 "Cannot modify a string having more than 1 reference");
1104 return -1;
1105 }
1106 _PyUnicode_DIRTY(unicode);
1107 return 0;
1108}
1109
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001110static int
1111_copy_characters(PyObject *to, Py_ssize_t to_start,
1112 PyObject *from, Py_ssize_t from_start,
1113 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001114{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001115 unsigned int from_kind, to_kind;
1116 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001117 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001119 assert(PyUnicode_Check(from));
1120 assert(PyUnicode_Check(to));
1121 assert(PyUnicode_IS_READY(from));
1122 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001124 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1125 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1126 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001128 if (how_many == 0)
1129 return 0;
1130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001131 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001132 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001133 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001134 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001135
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001136#ifdef Py_DEBUG
1137 if (!check_maxchar
1138 && (from_kind > to_kind
1139 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001140 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001141 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1142 Py_UCS4 ch;
1143 Py_ssize_t i;
1144 for (i=0; i < how_many; i++) {
1145 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1146 assert(ch <= to_maxchar);
1147 }
1148 }
1149#endif
1150 fast = (from_kind == to_kind);
1151 if (check_maxchar
1152 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1153 {
1154 /* deny latin1 => ascii */
1155 fast = 0;
1156 }
1157
1158 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001159 Py_MEMCPY((char*)to_data + to_kind * to_start,
1160 (char*)from_data + from_kind * from_start,
1161 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001163 else if (from_kind == PyUnicode_1BYTE_KIND
1164 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001165 {
1166 _PyUnicode_CONVERT_BYTES(
1167 Py_UCS1, Py_UCS2,
1168 PyUnicode_1BYTE_DATA(from) + from_start,
1169 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1170 PyUnicode_2BYTE_DATA(to) + to_start
1171 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001172 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001173 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001174 && to_kind == PyUnicode_4BYTE_KIND)
1175 {
1176 _PyUnicode_CONVERT_BYTES(
1177 Py_UCS1, Py_UCS4,
1178 PyUnicode_1BYTE_DATA(from) + from_start,
1179 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1180 PyUnicode_4BYTE_DATA(to) + to_start
1181 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001182 }
1183 else if (from_kind == PyUnicode_2BYTE_KIND
1184 && to_kind == PyUnicode_4BYTE_KIND)
1185 {
1186 _PyUnicode_CONVERT_BYTES(
1187 Py_UCS2, Py_UCS4,
1188 PyUnicode_2BYTE_DATA(from) + from_start,
1189 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1190 PyUnicode_4BYTE_DATA(to) + to_start
1191 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001192 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001193 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001194 /* check if max_char(from substring) <= max_char(to) */
1195 if (from_kind > to_kind
1196 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001197 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001198 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001199 /* slow path to check for character overflow */
1200 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001201 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001202 Py_ssize_t i;
1203
Victor Stinner56c161a2011-10-06 02:47:11 +02001204#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001205 for (i=0; i < how_many; i++) {
1206 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001207 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001208 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1209 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001210#else
1211 if (!check_maxchar) {
1212 for (i=0; i < how_many; i++) {
1213 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1214 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1215 }
1216 }
1217 else {
1218 for (i=0; i < how_many; i++) {
1219 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1220 if (ch > to_maxchar)
1221 return 1;
1222 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1223 }
1224 }
1225#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001226 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001227 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001228 assert(0 && "inconsistent state");
1229 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001230 }
1231 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001232 return 0;
1233}
1234
1235static void
1236copy_characters(PyObject *to, Py_ssize_t to_start,
1237 PyObject *from, Py_ssize_t from_start,
1238 Py_ssize_t how_many)
1239{
1240 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1241}
1242
1243Py_ssize_t
1244PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1245 PyObject *from, Py_ssize_t from_start,
1246 Py_ssize_t how_many)
1247{
1248 int err;
1249
1250 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1251 PyErr_BadInternalCall();
1252 return -1;
1253 }
1254
1255 if (PyUnicode_READY(from))
1256 return -1;
1257 if (PyUnicode_READY(to))
1258 return -1;
1259
1260 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1261 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1262 PyErr_Format(PyExc_SystemError,
1263 "Cannot write %zi characters at %zi "
1264 "in a string of %zi characters",
1265 how_many, to_start, PyUnicode_GET_LENGTH(to));
1266 return -1;
1267 }
1268
1269 if (how_many == 0)
1270 return 0;
1271
1272 if (_PyUnicode_Dirty(to))
1273 return -1;
1274
1275 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1276 if (err) {
1277 PyErr_Format(PyExc_SystemError,
1278 "Cannot copy %s characters "
1279 "into a string of %s characters",
1280 unicode_kind_name(from),
1281 unicode_kind_name(to));
1282 return -1;
1283 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001284 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001285}
1286
Victor Stinner17222162011-09-28 22:15:37 +02001287/* Find the maximum code point and count the number of surrogate pairs so a
1288 correct string length can be computed before converting a string to UCS4.
1289 This function counts single surrogates as a character and not as a pair.
1290
1291 Return 0 on success, or -1 on error. */
1292static int
1293find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1294 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001295{
1296 const wchar_t *iter;
1297
Victor Stinnerc53be962011-10-02 21:33:54 +02001298 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001299 *num_surrogates = 0;
1300 *maxchar = 0;
1301
1302 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001303 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001305#if SIZEOF_WCHAR_T != 2
1306 if (*maxchar >= 0x10000)
1307 return 0;
1308#endif
1309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310#if SIZEOF_WCHAR_T == 2
1311 if (*iter >= 0xD800 && *iter <= 0xDBFF
1312 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1313 {
1314 Py_UCS4 surrogate_val;
1315 surrogate_val = (((iter[0] & 0x3FF)<<10)
1316 | (iter[1] & 0x3FF)) + 0x10000;
1317 ++(*num_surrogates);
1318 if (surrogate_val > *maxchar)
1319 *maxchar = surrogate_val;
1320 iter += 2;
1321 }
1322 else
1323 iter++;
1324#else
1325 iter++;
1326#endif
1327 }
1328 return 0;
1329}
1330
1331#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001332static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333#endif
1334
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001335int
1336_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337{
1338 wchar_t *end;
1339 Py_UCS4 maxchar = 0;
1340 Py_ssize_t num_surrogates;
1341#if SIZEOF_WCHAR_T == 2
1342 Py_ssize_t length_wo_surrogates;
1343#endif
1344
Georg Brandl7597add2011-10-05 16:36:47 +02001345 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001346 strings were created using _PyObject_New() and where no canonical
1347 representation (the str field) has been set yet aka strings
1348 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001349 assert(_PyUnicode_CHECK(unicode));
1350 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001352 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001353 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001354 /* Actually, it should neither be interned nor be anything else: */
1355 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001356
1357#ifdef Py_DEBUG
1358 ++unicode_ready_calls;
1359#endif
1360
1361 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001362 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001363 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001364 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365
1366 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001367 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1368 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369 PyErr_NoMemory();
1370 return -1;
1371 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001372 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373 _PyUnicode_WSTR(unicode), end,
1374 PyUnicode_1BYTE_DATA(unicode));
1375 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1376 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1377 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1378 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001379 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001380 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001381 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382 }
1383 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001384 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001385 _PyUnicode_UTF8(unicode) = NULL;
1386 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 }
1388 PyObject_FREE(_PyUnicode_WSTR(unicode));
1389 _PyUnicode_WSTR(unicode) = NULL;
1390 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1391 }
1392 /* In this case we might have to convert down from 4-byte native
1393 wchar_t to 2-byte unicode. */
1394 else if (maxchar < 65536) {
1395 assert(num_surrogates == 0 &&
1396 "FindMaxCharAndNumSurrogatePairs() messed up");
1397
Victor Stinner506f5922011-09-28 22:34:18 +02001398#if SIZEOF_WCHAR_T == 2
1399 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001400 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001401 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1402 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1403 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001404 _PyUnicode_UTF8(unicode) = NULL;
1405 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001406#else
1407 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001408 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001409 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001410 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001411 PyErr_NoMemory();
1412 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 }
Victor Stinner506f5922011-09-28 22:34:18 +02001414 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1415 _PyUnicode_WSTR(unicode), end,
1416 PyUnicode_2BYTE_DATA(unicode));
1417 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1418 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1419 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001420 _PyUnicode_UTF8(unicode) = NULL;
1421 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001422 PyObject_FREE(_PyUnicode_WSTR(unicode));
1423 _PyUnicode_WSTR(unicode) = NULL;
1424 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1425#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 }
1427 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1428 else {
1429#if SIZEOF_WCHAR_T == 2
1430 /* in case the native representation is 2-bytes, we need to allocate a
1431 new normalized 4-byte version. */
1432 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001433 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1434 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001435 PyErr_NoMemory();
1436 return -1;
1437 }
1438 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1439 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001440 _PyUnicode_UTF8(unicode) = NULL;
1441 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001442 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1443 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001444 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 PyObject_FREE(_PyUnicode_WSTR(unicode));
1446 _PyUnicode_WSTR(unicode) = NULL;
1447 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1448#else
1449 assert(num_surrogates == 0);
1450
Victor Stinnerc3c74152011-10-02 20:39:55 +02001451 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001453 _PyUnicode_UTF8(unicode) = NULL;
1454 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1456#endif
1457 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1458 }
1459 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001460 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001461 return 0;
1462}
1463
Alexander Belopolsky40018472011-02-26 01:02:56 +00001464static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001465unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001466{
Walter Dörwald16807132007-05-25 13:52:07 +00001467 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001468 case SSTATE_NOT_INTERNED:
1469 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001470
Benjamin Peterson29060642009-01-31 22:14:21 +00001471 case SSTATE_INTERNED_MORTAL:
1472 /* revive dead object temporarily for DelItem */
1473 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001474 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001475 Py_FatalError(
1476 "deletion of interned string failed");
1477 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001478
Benjamin Peterson29060642009-01-31 22:14:21 +00001479 case SSTATE_INTERNED_IMMORTAL:
1480 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001481
Benjamin Peterson29060642009-01-31 22:14:21 +00001482 default:
1483 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001484 }
1485
Victor Stinner03490912011-10-03 23:45:12 +02001486 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001488 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001489 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490
1491 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001492 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001493 }
1494 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001495 if (_PyUnicode_DATA_ANY(unicode))
1496 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001497 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 }
1499}
1500
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if unicode is the shared empty string or one
   of the cached single-character objects stored in unicode_latin1[],
   otherwise return 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    header = (PyASCIIObject *)unicode;
    /* Only a one-character string that is not of the wchar kind can be an
       entry of the latin1 cache. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1517
Alexander Belopolsky40018472011-02-26 01:02:56 +00001518static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001519unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001520{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001521 if (Py_REFCNT(unicode) != 1)
1522 return 0;
1523 if (PyUnicode_CHECK_INTERNED(unicode))
1524 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001525#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001526 /* singleton refcount is greater than 1 */
1527 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001528#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001529 return 1;
1530}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001531
Victor Stinnerfe226c02011-10-03 03:52:20 +02001532static int
1533unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1534{
1535 PyObject *unicode;
1536 Py_ssize_t old_length;
1537
1538 assert(p_unicode != NULL);
1539 unicode = *p_unicode;
1540
1541 assert(unicode != NULL);
1542 assert(PyUnicode_Check(unicode));
1543 assert(0 <= length);
1544
Victor Stinner910337b2011-10-03 03:20:16 +02001545 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001546 old_length = PyUnicode_WSTR_LENGTH(unicode);
1547 else
1548 old_length = PyUnicode_GET_LENGTH(unicode);
1549 if (old_length == length)
1550 return 0;
1551
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001552 if (length == 0) {
1553 Py_DECREF(*p_unicode);
1554 *p_unicode = unicode_empty;
1555 Py_INCREF(*p_unicode);
1556 return 0;
1557 }
1558
Victor Stinnerfe226c02011-10-03 03:52:20 +02001559 if (!unicode_resizable(unicode)) {
1560 PyObject *copy = resize_copy(unicode, length);
1561 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001562 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001563 Py_DECREF(*p_unicode);
1564 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001565 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001566 }
1567
Victor Stinnerfe226c02011-10-03 03:52:20 +02001568 if (PyUnicode_IS_COMPACT(unicode)) {
1569 *p_unicode = resize_compact(unicode, length);
1570 if (*p_unicode == NULL)
1571 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001572 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001573 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001574 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001575 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001576}
1577
Alexander Belopolsky40018472011-02-26 01:02:56 +00001578int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001579PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001580{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001581 PyObject *unicode;
1582 if (p_unicode == NULL) {
1583 PyErr_BadInternalCall();
1584 return -1;
1585 }
1586 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001587 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001588 {
1589 PyErr_BadInternalCall();
1590 return -1;
1591 }
1592 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001593}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001594
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001595static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001596unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001597{
1598 PyObject *result;
1599 assert(PyUnicode_IS_READY(*p_unicode));
1600 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1601 return 0;
1602 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1603 maxchar);
1604 if (result == NULL)
1605 return -1;
1606 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1607 PyUnicode_GET_LENGTH(*p_unicode));
1608 Py_DECREF(*p_unicode);
1609 *p_unicode = result;
1610 return 0;
1611}
1612
1613static int
1614unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1615 Py_UCS4 ch)
1616{
1617 if (unicode_widen(p_unicode, ch) < 0)
1618 return -1;
1619 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1620 PyUnicode_DATA(*p_unicode),
1621 (*pos)++, ch);
1622 return 0;
1623}
1624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001625static PyObject*
1626get_latin1_char(unsigned char ch)
1627{
Victor Stinnera464fc12011-10-02 20:39:30 +02001628 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001629 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001630 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001631 if (!unicode)
1632 return NULL;
1633 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001634 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001635 unicode_latin1[ch] = unicode;
1636 }
1637 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001638 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001639}
1640
Alexander Belopolsky40018472011-02-26 01:02:56 +00001641PyObject *
1642PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001643{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001644 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001645 Py_UCS4 maxchar = 0;
1646 Py_ssize_t num_surrogates;
1647
1648 if (u == NULL)
1649 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001650
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001651 /* If the Unicode data is known at construction time, we can apply
1652 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001654 /* Optimization for empty strings */
1655 if (size == 0 && unicode_empty != NULL) {
1656 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001657 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001658 }
Tim Petersced69f82003-09-16 20:30:58 +00001659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660 /* Single character Unicode objects in the Latin-1 range are
1661 shared when using this constructor */
1662 if (size == 1 && *u < 256)
1663 return get_latin1_char((unsigned char)*u);
1664
1665 /* If not empty and not single character, copy the Unicode data
1666 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001667 if (find_maxchar_surrogates(u, u + size,
1668 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001669 return NULL;
1670
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001671 unicode = PyUnicode_New(size - num_surrogates,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 if (!unicode)
1674 return NULL;
1675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676 switch (PyUnicode_KIND(unicode)) {
1677 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001678 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1680 break;
1681 case PyUnicode_2BYTE_KIND:
1682#if Py_UNICODE_SIZE == 2
1683 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1684#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001685 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001686 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1687#endif
1688 break;
1689 case PyUnicode_4BYTE_KIND:
1690#if SIZEOF_WCHAR_T == 2
1691 /* This is the only case which has to process surrogates, thus
1692 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001693 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001694#else
1695 assert(num_surrogates == 0);
1696 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1697#endif
1698 break;
1699 default:
1700 assert(0 && "Impossible state");
1701 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001703 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704}
1705
Alexander Belopolsky40018472011-02-26 01:02:56 +00001706PyObject *
1707PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001708{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001709 if (size < 0) {
1710 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001711 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001712 return NULL;
1713 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001714
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001715 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001716 some optimizations which share commonly used objects.
1717 Also, this means the input must be UTF-8, so fall back to the
1718 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001719 if (u != NULL) {
1720
Benjamin Peterson29060642009-01-31 22:14:21 +00001721 /* Optimization for empty strings */
1722 if (size == 0 && unicode_empty != NULL) {
1723 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001724 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001725 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001726
1727 /* Single characters are shared when using this constructor.
1728 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001729 if (size == 1 && (unsigned char)*u < 128)
1730 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001731
1732 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001733 }
1734
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001735 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001736}
1737
Alexander Belopolsky40018472011-02-26 01:02:56 +00001738PyObject *
1739PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001740{
1741 size_t size = strlen(u);
1742 if (size > PY_SSIZE_T_MAX) {
1743 PyErr_SetString(PyExc_OverflowError, "input too long");
1744 return NULL;
1745 }
1746
1747 return PyUnicode_FromStringAndSize(u, size);
1748}
1749
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001750PyObject *
1751_PyUnicode_FromId(_Py_Identifier *id)
1752{
1753 if (!id->object) {
1754 id->object = PyUnicode_FromString(id->string);
1755 if (!id->object)
1756 return NULL;
1757 PyUnicode_InternInPlace(&id->object);
1758 assert(!id->next);
1759 id->next = static_strings;
1760 static_strings = id;
1761 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001762 return id->object;
1763}
1764
1765void
1766_PyUnicode_ClearStaticStrings()
1767{
1768 _Py_Identifier *i;
1769 for (i = static_strings; i; i = i->next) {
1770 Py_DECREF(i->object);
1771 i->object = NULL;
1772 i->next = NULL;
1773 }
1774}
1775
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001776/* Internal function, don't check maximum character */
1777
Victor Stinnere57b1c02011-09-28 22:20:48 +02001778static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001779unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001780{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001781 PyObject *res;
1782#ifdef Py_DEBUG
1783 const unsigned char *p;
1784 const unsigned char *end = s + size;
1785 for (p=s; p < end; p++) {
1786 assert(*p < 128);
1787 }
1788#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001789 if (size == 1)
1790 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001791 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001792 if (!res)
1793 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001794 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001795 return res;
1796}
1797
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001798static Py_UCS4
1799kind_maxchar_limit(unsigned int kind)
1800{
1801 switch(kind) {
1802 case PyUnicode_1BYTE_KIND:
1803 return 0x80;
1804 case PyUnicode_2BYTE_KIND:
1805 return 0x100;
1806 case PyUnicode_4BYTE_KIND:
1807 return 0x10000;
1808 default:
1809 assert(0 && "invalid kind");
1810 return 0x10ffff;
1811 }
1812}
1813
Victor Stinner702c7342011-10-05 13:50:52 +02001814static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001815_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001816{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001818 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001819
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001820 if (size == 0) {
1821 Py_INCREF(unicode_empty);
1822 return unicode_empty;
1823 }
1824 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001825 if (size == 1)
1826 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001827
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001828 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001829 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830 if (!res)
1831 return NULL;
1832 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001833 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001835}
1836
Victor Stinnere57b1c02011-09-28 22:20:48 +02001837static PyObject*
1838_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839{
1840 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001841 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001842
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001843 if (size == 0) {
1844 Py_INCREF(unicode_empty);
1845 return unicode_empty;
1846 }
1847 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001848 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001849 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001850
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001851 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001852 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001853 if (!res)
1854 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001855 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001856 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001857 else {
1858 _PyUnicode_CONVERT_BYTES(
1859 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1860 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001861 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001862 return res;
1863}
1864
Victor Stinnere57b1c02011-09-28 22:20:48 +02001865static PyObject*
1866_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867{
1868 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001869 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001870
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001871 if (size == 0) {
1872 Py_INCREF(unicode_empty);
1873 return unicode_empty;
1874 }
1875 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001876 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001877 return get_latin1_char((unsigned char)u[0]);
1878
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001879 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001880 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001881 if (!res)
1882 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001883 if (max_char < 256)
1884 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1885 PyUnicode_1BYTE_DATA(res));
1886 else if (max_char < 0x10000)
1887 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1888 PyUnicode_2BYTE_DATA(res));
1889 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001890 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001891 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 return res;
1893}
1894
1895PyObject*
1896PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1897{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001898 if (size < 0) {
1899 PyErr_SetString(PyExc_ValueError, "size must be positive");
1900 return NULL;
1901 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001902 switch(kind) {
1903 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001904 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001905 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001906 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001908 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001909 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001910 PyErr_SetString(PyExc_SystemError, "invalid kind");
1911 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913}
1914
Victor Stinner25a4b292011-10-06 12:31:55 +02001915/* Ensure that a string uses the most efficient storage, if it is not the
1916 case: create a new string with of the right kind. Write NULL into *p_unicode
1917 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001918static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001919unicode_adjust_maxchar(PyObject **p_unicode)
1920{
1921 PyObject *unicode, *copy;
1922 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001923 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001924 unsigned int kind;
1925
1926 assert(p_unicode != NULL);
1927 unicode = *p_unicode;
1928 assert(PyUnicode_IS_READY(unicode));
1929 if (PyUnicode_IS_ASCII(unicode))
1930 return;
1931
1932 len = PyUnicode_GET_LENGTH(unicode);
1933 kind = PyUnicode_KIND(unicode);
1934 if (kind == PyUnicode_1BYTE_KIND) {
1935 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001936 max_char = ucs1lib_find_max_char(u, u + len);
1937 if (max_char >= 128)
1938 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001939 }
1940 else if (kind == PyUnicode_2BYTE_KIND) {
1941 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001942 max_char = ucs2lib_find_max_char(u, u + len);
1943 if (max_char >= 256)
1944 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001945 }
1946 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001947 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001948 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001949 max_char = ucs4lib_find_max_char(u, u + len);
1950 if (max_char >= 0x10000)
1951 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001952 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001953 copy = PyUnicode_New(len, max_char);
1954 copy_characters(copy, 0, unicode, 0, len);
1955 Py_DECREF(unicode);
1956 *p_unicode = copy;
1957}
1958
Victor Stinner034f6cf2011-09-30 02:26:44 +02001959PyObject*
1960PyUnicode_Copy(PyObject *unicode)
1961{
Victor Stinner87af4f22011-11-21 23:03:47 +01001962 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001963 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001964
Victor Stinner034f6cf2011-09-30 02:26:44 +02001965 if (!PyUnicode_Check(unicode)) {
1966 PyErr_BadInternalCall();
1967 return NULL;
1968 }
1969 if (PyUnicode_READY(unicode))
1970 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001971
Victor Stinner87af4f22011-11-21 23:03:47 +01001972 length = PyUnicode_GET_LENGTH(unicode);
1973 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001974 if (!copy)
1975 return NULL;
1976 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1977
Victor Stinner87af4f22011-11-21 23:03:47 +01001978 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1979 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001980 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001981 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001982}
1983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984
Victor Stinnerbc603d12011-10-02 01:00:40 +02001985/* Widen Unicode objects to larger buffers. Don't write terminating null
1986 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001987
1988void*
1989_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1990{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001991 Py_ssize_t len;
1992 void *result;
1993 unsigned int skind;
1994
1995 if (PyUnicode_READY(s))
1996 return NULL;
1997
1998 len = PyUnicode_GET_LENGTH(s);
1999 skind = PyUnicode_KIND(s);
2000 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002001 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 return NULL;
2003 }
2004 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002005 case PyUnicode_2BYTE_KIND:
2006 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2007 if (!result)
2008 return PyErr_NoMemory();
2009 assert(skind == PyUnicode_1BYTE_KIND);
2010 _PyUnicode_CONVERT_BYTES(
2011 Py_UCS1, Py_UCS2,
2012 PyUnicode_1BYTE_DATA(s),
2013 PyUnicode_1BYTE_DATA(s) + len,
2014 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002015 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002016 case PyUnicode_4BYTE_KIND:
2017 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2018 if (!result)
2019 return PyErr_NoMemory();
2020 if (skind == PyUnicode_2BYTE_KIND) {
2021 _PyUnicode_CONVERT_BYTES(
2022 Py_UCS2, Py_UCS4,
2023 PyUnicode_2BYTE_DATA(s),
2024 PyUnicode_2BYTE_DATA(s) + len,
2025 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002027 else {
2028 assert(skind == PyUnicode_1BYTE_KIND);
2029 _PyUnicode_CONVERT_BYTES(
2030 Py_UCS1, Py_UCS4,
2031 PyUnicode_1BYTE_DATA(s),
2032 PyUnicode_1BYTE_DATA(s) + len,
2033 result);
2034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002036 default:
2037 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002038 }
Victor Stinner01698042011-10-04 00:04:26 +02002039 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002040 return NULL;
2041}
2042
2043static Py_UCS4*
2044as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2045 int copy_null)
2046{
2047 int kind;
2048 void *data;
2049 Py_ssize_t len, targetlen;
2050 if (PyUnicode_READY(string) == -1)
2051 return NULL;
2052 kind = PyUnicode_KIND(string);
2053 data = PyUnicode_DATA(string);
2054 len = PyUnicode_GET_LENGTH(string);
2055 targetlen = len;
2056 if (copy_null)
2057 targetlen++;
2058 if (!target) {
2059 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2060 PyErr_NoMemory();
2061 return NULL;
2062 }
2063 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2064 if (!target) {
2065 PyErr_NoMemory();
2066 return NULL;
2067 }
2068 }
2069 else {
2070 if (targetsize < targetlen) {
2071 PyErr_Format(PyExc_SystemError,
2072 "string is longer than the buffer");
2073 if (copy_null && 0 < targetsize)
2074 target[0] = 0;
2075 return NULL;
2076 }
2077 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002078 if (kind == PyUnicode_1BYTE_KIND) {
2079 Py_UCS1 *start = (Py_UCS1 *) data;
2080 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002082 else if (kind == PyUnicode_2BYTE_KIND) {
2083 Py_UCS2 *start = (Py_UCS2 *) data;
2084 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2085 }
2086 else {
2087 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002088 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002090 if (copy_null)
2091 target[len] = 0;
2092 return target;
2093}
2094
2095Py_UCS4*
2096PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2097 int copy_null)
2098{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002099 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002100 PyErr_BadInternalCall();
2101 return NULL;
2102 }
2103 return as_ucs4(string, target, targetsize, copy_null);
2104}
2105
2106Py_UCS4*
2107PyUnicode_AsUCS4Copy(PyObject *string)
2108{
2109 return as_ucs4(string, NULL, 0, 1);
2110}
2111
#ifdef HAVE_WCHAR_H

/* Create a string from the wide-character buffer `w`.

   A size of -1 means that the length is measured with wcslen().  With
   w == NULL an empty string is returned for size 0 and any other size is
   reported as a bad internal call.  Return NULL on failure. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    /* No buffer: only the empty string can be built. */
    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002132
Walter Dörwald346737f2007-05-31 10:44:43 +00002133static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002134makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2135 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002136{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002137 *fmt++ = '%';
2138 if (width) {
2139 if (zeropad)
2140 *fmt++ = '0';
2141 fmt += sprintf(fmt, "%d", width);
2142 }
2143 if (precision)
2144 fmt += sprintf(fmt, ".%d", precision);
2145 if (longflag)
2146 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002147 else if (longlongflag) {
2148 /* longlongflag should only ever be nonzero on machines with
2149 HAVE_LONG_LONG defined */
2150#ifdef HAVE_LONG_LONG
2151 char *f = PY_FORMAT_LONG_LONG;
2152 while (*f)
2153 *fmt++ = *f++;
2154#else
2155 /* we shouldn't ever get here */
2156 assert(0);
2157 *fmt++ = 'l';
2158#endif
2159 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002160 else if (size_tflag) {
2161 char *f = PY_FORMAT_SIZE_T;
2162 while (*f)
2163 *fmt++ = *f++;
2164 }
2165 *fmt++ = c;
2166 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002167}
2168
/* helper for PyUnicode_FromFormatV()

   Parse the width, precision and length-modifier part of a format
   specification.  `f` points at the first character of the specification
   (the '%' in the examples below).  Each output pointer may be NULL, in
   which case that value is not stored.

   Returns the advanced position in the format string: after the digits
   and, when a recognized modifier ('l', 'll' or 'z' followed by 'd', 'u'
   or 'i') is present, after that modifier. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        for (f++; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    switch (*f) {
    case 'l':
        /* Handle %ld, %lu, %lld and %llu (and the 'i' variants). */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;

    case 'z':
        /* handle the size_t flag. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;

    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2233
/* Number of characters reserved for the output of one %ld conversion:
   21 is enough for a 64-bit integer written in decimal together with an
   optional sign. */
#define MAX_LONG_CHARS 21
/* Number of characters reserved for the output of one %lld conversion.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS ((((SIZEOF_LONG_LONG) * 53 - 1) / 22) + 2)
2241
Walter Dörwaldd2034312007-05-18 16:29:38 +00002242PyObject *
2243PyUnicode_FromFormatV(const char *format, va_list vargs)
2244{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002245 va_list count;
2246 Py_ssize_t callcount = 0;
2247 PyObject **callresults = NULL;
2248 PyObject **callresult = NULL;
2249 Py_ssize_t n = 0;
2250 int width = 0;
2251 int precision = 0;
2252 int zeropad;
2253 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002254 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002255 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002256 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002257 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2258 Py_UCS4 argmaxchar;
2259 Py_ssize_t numbersize = 0;
2260 char *numberresults = NULL;
2261 char *numberresult = NULL;
2262 Py_ssize_t i;
2263 int kind;
2264 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002265
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002266 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002267 /* step 1: count the number of %S/%R/%A/%s format specifications
2268 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2269 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002271 * also estimate a upper bound for all the number formats in the string,
2272 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002274 for (f = format; *f; f++) {
2275 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002276 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2278 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2279 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2280 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002283#ifdef HAVE_LONG_LONG
2284 if (longlongflag) {
2285 if (width < MAX_LONG_LONG_CHARS)
2286 width = MAX_LONG_LONG_CHARS;
2287 }
2288 else
2289#endif
2290 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2291 including sign. Decimal takes the most space. This
2292 isn't enough for octal. If a width is specified we
2293 need more (which we allocate later). */
2294 if (width < MAX_LONG_CHARS)
2295 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002296
2297 /* account for the size + '\0' to separate numbers
2298 inside of the numberresults buffer */
2299 numbersize += (width + 1);
2300 }
2301 }
2302 else if ((unsigned char)*f > 127) {
2303 PyErr_Format(PyExc_ValueError,
2304 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2305 "string, got a non-ASCII byte: 0x%02x",
2306 (unsigned char)*f);
2307 return NULL;
2308 }
2309 }
2310 /* step 2: allocate memory for the results of
2311 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2312 if (callcount) {
2313 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2314 if (!callresults) {
2315 PyErr_NoMemory();
2316 return NULL;
2317 }
2318 callresult = callresults;
2319 }
2320 /* step 2.5: allocate memory for the results of formating numbers */
2321 if (numbersize) {
2322 numberresults = PyObject_Malloc(numbersize);
2323 if (!numberresults) {
2324 PyErr_NoMemory();
2325 goto fail;
2326 }
2327 numberresult = numberresults;
2328 }
2329
2330 /* step 3: format numbers and figure out how large a buffer we need */
2331 for (f = format; *f; f++) {
2332 if (*f == '%') {
2333 const char* p;
2334 int longflag;
2335 int longlongflag;
2336 int size_tflag;
2337 int numprinted;
2338
2339 p = f;
2340 zeropad = (f[1] == '0');
2341 f = parse_format_flags(f, &width, &precision,
2342 &longflag, &longlongflag, &size_tflag);
2343 switch (*f) {
2344 case 'c':
2345 {
2346 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002347 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002348 n++;
2349 break;
2350 }
2351 case '%':
2352 n++;
2353 break;
2354 case 'i':
2355 case 'd':
2356 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2357 width, precision, *f);
2358 if (longflag)
2359 numprinted = sprintf(numberresult, fmt,
2360 va_arg(count, long));
2361#ifdef HAVE_LONG_LONG
2362 else if (longlongflag)
2363 numprinted = sprintf(numberresult, fmt,
2364 va_arg(count, PY_LONG_LONG));
2365#endif
2366 else if (size_tflag)
2367 numprinted = sprintf(numberresult, fmt,
2368 va_arg(count, Py_ssize_t));
2369 else
2370 numprinted = sprintf(numberresult, fmt,
2371 va_arg(count, int));
2372 n += numprinted;
2373 /* advance by +1 to skip over the '\0' */
2374 numberresult += (numprinted + 1);
2375 assert(*(numberresult - 1) == '\0');
2376 assert(*(numberresult - 2) != '\0');
2377 assert(numprinted >= 0);
2378 assert(numberresult <= numberresults + numbersize);
2379 break;
2380 case 'u':
2381 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2382 width, precision, 'u');
2383 if (longflag)
2384 numprinted = sprintf(numberresult, fmt,
2385 va_arg(count, unsigned long));
2386#ifdef HAVE_LONG_LONG
2387 else if (longlongflag)
2388 numprinted = sprintf(numberresult, fmt,
2389 va_arg(count, unsigned PY_LONG_LONG));
2390#endif
2391 else if (size_tflag)
2392 numprinted = sprintf(numberresult, fmt,
2393 va_arg(count, size_t));
2394 else
2395 numprinted = sprintf(numberresult, fmt,
2396 va_arg(count, unsigned int));
2397 n += numprinted;
2398 numberresult += (numprinted + 1);
2399 assert(*(numberresult - 1) == '\0');
2400 assert(*(numberresult - 2) != '\0');
2401 assert(numprinted >= 0);
2402 assert(numberresult <= numberresults + numbersize);
2403 break;
2404 case 'x':
2405 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2406 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2407 n += numprinted;
2408 numberresult += (numprinted + 1);
2409 assert(*(numberresult - 1) == '\0');
2410 assert(*(numberresult - 2) != '\0');
2411 assert(numprinted >= 0);
2412 assert(numberresult <= numberresults + numbersize);
2413 break;
2414 case 'p':
2415 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2416 /* %p is ill-defined: ensure leading 0x. */
2417 if (numberresult[1] == 'X')
2418 numberresult[1] = 'x';
2419 else if (numberresult[1] != 'x') {
2420 memmove(numberresult + 2, numberresult,
2421 strlen(numberresult) + 1);
2422 numberresult[0] = '0';
2423 numberresult[1] = 'x';
2424 numprinted += 2;
2425 }
2426 n += numprinted;
2427 numberresult += (numprinted + 1);
2428 assert(*(numberresult - 1) == '\0');
2429 assert(*(numberresult - 2) != '\0');
2430 assert(numprinted >= 0);
2431 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002432 break;
2433 case 's':
2434 {
2435 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002436 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002437 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2438 if (!str)
2439 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002440 /* since PyUnicode_DecodeUTF8 returns already flexible
2441 unicode objects, there is no need to call ready on them */
2442 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002443 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002445 /* Remember the str and switch to the next slot */
2446 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002447 break;
2448 }
2449 case 'U':
2450 {
2451 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002452 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002453 if (PyUnicode_READY(obj) == -1)
2454 goto fail;
2455 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002456 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002458 break;
2459 }
2460 case 'V':
2461 {
2462 PyObject *obj = va_arg(count, PyObject *);
2463 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002464 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002465 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002466 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002467 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 if (PyUnicode_READY(obj) == -1)
2469 goto fail;
2470 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002471 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002472 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002473 *callresult++ = NULL;
2474 }
2475 else {
2476 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2477 if (!str_obj)
2478 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002479 if (PyUnicode_READY(str_obj)) {
2480 Py_DECREF(str_obj);
2481 goto fail;
2482 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002483 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002484 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002485 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002486 *callresult++ = str_obj;
2487 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002488 break;
2489 }
2490 case 'S':
2491 {
2492 PyObject *obj = va_arg(count, PyObject *);
2493 PyObject *str;
2494 assert(obj);
2495 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002497 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002498 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002499 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002500 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002501 /* Remember the str and switch to the next slot */
2502 *callresult++ = str;
2503 break;
2504 }
2505 case 'R':
2506 {
2507 PyObject *obj = va_arg(count, PyObject *);
2508 PyObject *repr;
2509 assert(obj);
2510 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002511 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002512 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002513 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002514 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002516 /* Remember the repr and switch to the next slot */
2517 *callresult++ = repr;
2518 break;
2519 }
2520 case 'A':
2521 {
2522 PyObject *obj = va_arg(count, PyObject *);
2523 PyObject *ascii;
2524 assert(obj);
2525 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002527 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002528 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002529 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002530 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002531 /* Remember the repr and switch to the next slot */
2532 *callresult++ = ascii;
2533 break;
2534 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002535 default:
2536 /* if we stumble upon an unknown
2537 formatting code, copy the rest of
2538 the format string to the output
2539 string. (we cannot just skip the
2540 code, since there's no way to know
2541 what's in the argument list) */
2542 n += strlen(p);
2543 goto expand;
2544 }
2545 } else
2546 n++;
2547 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002548 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002549 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002550 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002551 we don't have to resize the string.
2552 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002553 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002554 if (!string)
2555 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 kind = PyUnicode_KIND(string);
2557 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002559 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002562 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002563 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002564
2565 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002566 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2567 /* checking for == because the last argument could be a empty
2568 string, which causes i to point to end, the assert at the end of
2569 the loop */
2570 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002571
Benjamin Peterson14339b62009-01-31 16:36:08 +00002572 switch (*f) {
2573 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002574 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002575 const int ordinal = va_arg(vargs, int);
2576 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002577 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002578 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002579 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002580 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002581 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002583 case 'p':
2584 /* unused, since we already have the result */
2585 if (*f == 'p')
2586 (void) va_arg(vargs, void *);
2587 else
2588 (void) va_arg(vargs, int);
2589 /* extract the result from numberresults and append. */
2590 for (; *numberresult; ++i, ++numberresult)
2591 PyUnicode_WRITE(kind, data, i, *numberresult);
2592 /* skip over the separating '\0' */
2593 assert(*numberresult == '\0');
2594 numberresult++;
2595 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002596 break;
2597 case 's':
2598 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002599 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002600 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002601 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602 size = PyUnicode_GET_LENGTH(*callresult);
2603 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002604 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002605 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002606 /* We're done with the unicode()/repr() => forget it */
2607 Py_DECREF(*callresult);
2608 /* switch to next unicode()/repr() result */
2609 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002610 break;
2611 }
2612 case 'U':
2613 {
2614 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002615 Py_ssize_t size;
2616 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2617 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002618 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002620 break;
2621 }
2622 case 'V':
2623 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002624 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002625 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002626 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 size = PyUnicode_GET_LENGTH(obj);
2629 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002630 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002631 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002632 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002633 size = PyUnicode_GET_LENGTH(*callresult);
2634 assert(PyUnicode_KIND(*callresult) <=
2635 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002636 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002638 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002639 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002640 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002641 break;
2642 }
2643 case 'S':
2644 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002645 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002646 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002647 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 /* unused, since we already have the result */
2649 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002650 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002651 copy_characters(string, i, *callresult, 0, size);
2652 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002653 /* We're done with the unicode()/repr() => forget it */
2654 Py_DECREF(*callresult);
2655 /* switch to next unicode()/repr() result */
2656 ++callresult;
2657 break;
2658 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002659 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002660 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002661 break;
2662 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002663 for (; *p; ++p, ++i)
2664 PyUnicode_WRITE(kind, data, i, *p);
2665 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002666 goto end;
2667 }
Victor Stinner1205f272010-09-11 00:54:47 +00002668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 else {
2670 assert(i < PyUnicode_GET_LENGTH(string));
2671 PyUnicode_WRITE(kind, data, i++, *f);
2672 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002673 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002674 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002675
Benjamin Peterson29060642009-01-31 22:14:21 +00002676 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002677 if (callresults)
2678 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002679 if (numberresults)
2680 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002681 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002682 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002683 if (callresults) {
2684 PyObject **callresult2 = callresults;
2685 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002686 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002687 ++callresult2;
2688 }
2689 PyObject_Free(callresults);
2690 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002691 if (numberresults)
2692 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002693 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002694}
2695
Walter Dörwaldd2034312007-05-18 16:29:38 +00002696PyObject *
2697PyUnicode_FromFormat(const char *format, ...)
2698{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002699 PyObject* ret;
2700 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002701
2702#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002703 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002704#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002705 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002706#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002707 ret = PyUnicode_FromFormatV(format, vargs);
2708 va_end(vargs);
2709 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002710}
2711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002712#ifdef HAVE_WCHAR_H
2713
Victor Stinner5593d8a2010-10-02 11:11:27 +00002714/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2715 convert a Unicode object to a wide character string.
2716
Victor Stinnerd88d9832011-09-06 02:00:05 +02002717 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002718 character) required to convert the unicode object. Ignore size argument.
2719
Victor Stinnerd88d9832011-09-06 02:00:05 +02002720 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002721 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002722 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002723static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002724unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002725 wchar_t *w,
2726 Py_ssize_t size)
2727{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002728 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 const wchar_t *wstr;
2730
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002731 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002732 if (wstr == NULL)
2733 return -1;
2734
Victor Stinner5593d8a2010-10-02 11:11:27 +00002735 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002736 if (size > res)
2737 size = res + 1;
2738 else
2739 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002740 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002741 return res;
2742 }
2743 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002744 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002745}
2746
2747Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002748PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002749 wchar_t *w,
2750 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002751{
2752 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002753 PyErr_BadInternalCall();
2754 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002756 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757}
2758
Victor Stinner137c34c2010-09-29 10:25:54 +00002759wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002760PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002761 Py_ssize_t *size)
2762{
2763 wchar_t* buffer;
2764 Py_ssize_t buflen;
2765
2766 if (unicode == NULL) {
2767 PyErr_BadInternalCall();
2768 return NULL;
2769 }
2770
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002771 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002772 if (buflen == -1)
2773 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002774 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002775 PyErr_NoMemory();
2776 return NULL;
2777 }
2778
Victor Stinner137c34c2010-09-29 10:25:54 +00002779 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2780 if (buffer == NULL) {
2781 PyErr_NoMemory();
2782 return NULL;
2783 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002784 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002785 if (buflen == -1)
2786 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002787 if (size != NULL)
2788 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002789 return buffer;
2790}
2791
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002793
Alexander Belopolsky40018472011-02-26 01:02:56 +00002794PyObject *
2795PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002797 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002798 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002799 PyErr_SetString(PyExc_ValueError,
2800 "chr() arg not in range(0x110000)");
2801 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002802 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002804 if (ordinal < 256)
2805 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002807 v = PyUnicode_New(1, ordinal);
2808 if (v == NULL)
2809 return NULL;
2810 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002811 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002812 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002813}
2814
Alexander Belopolsky40018472011-02-26 01:02:56 +00002815PyObject *
2816PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002818 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002819 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002820 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002821 if (PyUnicode_READY(obj))
2822 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002823 Py_INCREF(obj);
2824 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002825 }
2826 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002827 /* For a Unicode subtype that's not a Unicode object,
2828 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002829 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002830 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002831 PyErr_Format(PyExc_TypeError,
2832 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002833 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002834 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002835}
2836
Alexander Belopolsky40018472011-02-26 01:02:56 +00002837PyObject *
2838PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002839 const char *encoding,
2840 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002841{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002842 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002843 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002844
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002846 PyErr_BadInternalCall();
2847 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002849
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002850 /* Decoding bytes objects is the most common case and should be fast */
2851 if (PyBytes_Check(obj)) {
2852 if (PyBytes_GET_SIZE(obj) == 0) {
2853 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002854 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002855 }
2856 else {
2857 v = PyUnicode_Decode(
2858 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2859 encoding, errors);
2860 }
2861 return v;
2862 }
2863
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002864 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002865 PyErr_SetString(PyExc_TypeError,
2866 "decoding str is not supported");
2867 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002868 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002869
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002870 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2871 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2872 PyErr_Format(PyExc_TypeError,
2873 "coercing to str: need bytes, bytearray "
2874 "or buffer-like object, %.80s found",
2875 Py_TYPE(obj)->tp_name);
2876 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002877 }
Tim Petersced69f82003-09-16 20:30:58 +00002878
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002879 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002880 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002881 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002882 }
Tim Petersced69f82003-09-16 20:30:58 +00002883 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002884 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002885
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002886 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002887 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888}
2889
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.

   encoding:  NUL-terminated codec name, or NULL, which is treated as "utf-8".
   lower:     output buffer receiving the normalized, NUL-terminated name.
   lower_len: size of the output buffer in bytes, terminator included.

   Return 0 on error (the normalized name is longer than lower_len-1),
   1 on success.  On error the content of lower is unspecified and not
   NUL-terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* A zero-sized buffer cannot even hold the terminator; bail out before
       computing &lower[lower_len - 1], which would wrap around. */
    if (lower_len == 0)
        return 0;

    /* Route the default name through the bounded copy loop below rather
       than an unchecked strcpy(), so that a too-small buffer is reported
       instead of being overflowed.  "utf-8" is already normalized, so the
       loop copies it unchanged. */
    if (encoding == NULL)
        encoding = "utf-8";

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2926
Alexander Belopolsky40018472011-02-26 01:02:56 +00002927PyObject *
2928PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002929 Py_ssize_t size,
2930 const char *encoding,
2931 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002932{
2933 PyObject *buffer = NULL, *unicode;
2934 Py_buffer info;
2935 char lower[11]; /* Enough for any encoding shortcut */
2936
Fred Drakee4315f52000-05-09 19:53:39 +00002937 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002938 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002939 if ((strcmp(lower, "utf-8") == 0) ||
2940 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002941 return PyUnicode_DecodeUTF8(s, size, errors);
2942 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002943 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002944 (strcmp(lower, "iso-8859-1") == 0))
2945 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002946#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002947 else if (strcmp(lower, "mbcs") == 0)
2948 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002949#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002950 else if (strcmp(lower, "ascii") == 0)
2951 return PyUnicode_DecodeASCII(s, size, errors);
2952 else if (strcmp(lower, "utf-16") == 0)
2953 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2954 else if (strcmp(lower, "utf-32") == 0)
2955 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2956 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002957
2958 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002959 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002960 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002961 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002962 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963 if (buffer == NULL)
2964 goto onError;
2965 unicode = PyCodec_Decode(buffer, encoding, errors);
2966 if (unicode == NULL)
2967 goto onError;
2968 if (!PyUnicode_Check(unicode)) {
2969 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002970 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002971 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972 Py_DECREF(unicode);
2973 goto onError;
2974 }
2975 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002976 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002977
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 Py_XDECREF(buffer);
2980 return NULL;
2981}
2982
Alexander Belopolsky40018472011-02-26 01:02:56 +00002983PyObject *
2984PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002985 const char *encoding,
2986 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002987{
2988 PyObject *v;
2989
2990 if (!PyUnicode_Check(unicode)) {
2991 PyErr_BadArgument();
2992 goto onError;
2993 }
2994
2995 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002996 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002997
2998 /* Decode via the codec registry */
2999 v = PyCodec_Decode(unicode, encoding, errors);
3000 if (v == NULL)
3001 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003002 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003003
Benjamin Peterson29060642009-01-31 22:14:21 +00003004 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003005 return NULL;
3006}
3007
Alexander Belopolsky40018472011-02-26 01:02:56 +00003008PyObject *
3009PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003010 const char *encoding,
3011 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003012{
3013 PyObject *v;
3014
3015 if (!PyUnicode_Check(unicode)) {
3016 PyErr_BadArgument();
3017 goto onError;
3018 }
3019
3020 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003021 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003022
3023 /* Decode via the codec registry */
3024 v = PyCodec_Decode(unicode, encoding, errors);
3025 if (v == NULL)
3026 goto onError;
3027 if (!PyUnicode_Check(v)) {
3028 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003029 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003030 Py_TYPE(v)->tp_name);
3031 Py_DECREF(v);
3032 goto onError;
3033 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003034 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003035
Benjamin Peterson29060642009-01-31 22:14:21 +00003036 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003037 return NULL;
3038}
3039
Alexander Belopolsky40018472011-02-26 01:02:56 +00003040PyObject *
3041PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003042 Py_ssize_t size,
3043 const char *encoding,
3044 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003045{
3046 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003047
Guido van Rossumd57fd912000-03-10 22:53:23 +00003048 unicode = PyUnicode_FromUnicode(s, size);
3049 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003050 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3052 Py_DECREF(unicode);
3053 return v;
3054}
3055
Alexander Belopolsky40018472011-02-26 01:02:56 +00003056PyObject *
3057PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003058 const char *encoding,
3059 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003060{
3061 PyObject *v;
3062
3063 if (!PyUnicode_Check(unicode)) {
3064 PyErr_BadArgument();
3065 goto onError;
3066 }
3067
3068 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003069 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003070
3071 /* Encode via the codec registry */
3072 v = PyCodec_Encode(unicode, encoding, errors);
3073 if (v == NULL)
3074 goto onError;
3075 return v;
3076
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003078 return NULL;
3079}
3080
Victor Stinnerad158722010-10-27 00:25:46 +00003081PyObject *
3082PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003083{
Victor Stinner99b95382011-07-04 14:23:54 +02003084#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003085 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003086#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003087 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003088#else
Victor Stinner793b5312011-04-27 00:24:21 +02003089 PyInterpreterState *interp = PyThreadState_GET()->interp;
3090 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3091 cannot use it to encode and decode filenames before it is loaded. Load
3092 the Python codec requires to encode at least its own filename. Use the C
3093 version of the locale codec until the codec registry is initialized and
3094 the Python codec is loaded.
3095
3096 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3097 cannot only rely on it: check also interp->fscodec_initialized for
3098 subinterpreters. */
3099 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003100 return PyUnicode_AsEncodedString(unicode,
3101 Py_FileSystemDefaultEncoding,
3102 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003103 }
3104 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003105 /* locale encoding with surrogateescape */
3106 wchar_t *wchar;
3107 char *bytes;
3108 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003109 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003110
3111 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3112 if (wchar == NULL)
3113 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003114 bytes = _Py_wchar2char(wchar, &error_pos);
3115 if (bytes == NULL) {
3116 if (error_pos != (size_t)-1) {
3117 char *errmsg = strerror(errno);
3118 PyObject *exc = NULL;
3119 if (errmsg == NULL)
3120 errmsg = "Py_wchar2char() failed";
3121 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003122 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003123 error_pos, error_pos+1,
3124 errmsg);
3125 Py_XDECREF(exc);
3126 }
3127 else
3128 PyErr_NoMemory();
3129 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003130 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003131 }
3132 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003133
3134 bytes_obj = PyBytes_FromString(bytes);
3135 PyMem_Free(bytes);
3136 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003137 }
Victor Stinnerad158722010-10-27 00:25:46 +00003138#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003139}
3140
Alexander Belopolsky40018472011-02-26 01:02:56 +00003141PyObject *
3142PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003143 const char *encoding,
3144 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003145{
3146 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003147 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003148
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149 if (!PyUnicode_Check(unicode)) {
3150 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003151 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003152 }
Fred Drakee4315f52000-05-09 19:53:39 +00003153
Fred Drakee4315f52000-05-09 19:53:39 +00003154 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003155 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003156 if ((strcmp(lower, "utf-8") == 0) ||
3157 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003158 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003159 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003160 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003161 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003162 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003163 }
Victor Stinner37296e82010-06-10 13:36:23 +00003164 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003165 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003166 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003167 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003168#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003169 else if (strcmp(lower, "mbcs") == 0)
3170 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003171#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003172 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003173 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003174 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003175
3176 /* Encode via the codec registry */
3177 v = PyCodec_Encode(unicode, encoding, errors);
3178 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003179 return NULL;
3180
3181 /* The normal path */
3182 if (PyBytes_Check(v))
3183 return v;
3184
3185 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003186 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003187 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003188 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003189
3190 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3191 "encoder %s returned bytearray instead of bytes",
3192 encoding);
3193 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003194 Py_DECREF(v);
3195 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003196 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003197
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003198 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3199 Py_DECREF(v);
3200 return b;
3201 }
3202
3203 PyErr_Format(PyExc_TypeError,
3204 "encoder did not return a bytes object (type=%.400s)",
3205 Py_TYPE(v)->tp_name);
3206 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003207 return NULL;
3208}
3209
Alexander Belopolsky40018472011-02-26 01:02:56 +00003210PyObject *
3211PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003212 const char *encoding,
3213 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003214{
3215 PyObject *v;
3216
3217 if (!PyUnicode_Check(unicode)) {
3218 PyErr_BadArgument();
3219 goto onError;
3220 }
3221
3222 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003224
3225 /* Encode via the codec registry */
3226 v = PyCodec_Encode(unicode, encoding, errors);
3227 if (v == NULL)
3228 goto onError;
3229 if (!PyUnicode_Check(v)) {
3230 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003231 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003232 Py_TYPE(v)->tp_name);
3233 Py_DECREF(v);
3234 goto onError;
3235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003237
Benjamin Peterson29060642009-01-31 22:14:21 +00003238 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 return NULL;
3240}
3241
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003242PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003243PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003244 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003245 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3246}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003247
Christian Heimes5894ba72007-11-04 11:43:14 +00003248PyObject*
3249PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3250{
Victor Stinner99b95382011-07-04 14:23:54 +02003251#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003252 return PyUnicode_DecodeMBCS(s, size, NULL);
3253#elif defined(__APPLE__)
3254 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3255#else
Victor Stinner793b5312011-04-27 00:24:21 +02003256 PyInterpreterState *interp = PyThreadState_GET()->interp;
3257 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3258 cannot use it to encode and decode filenames before it is loaded. Load
3259 the Python codec requires to encode at least its own filename. Use the C
3260 version of the locale codec until the codec registry is initialized and
3261 the Python codec is loaded.
3262
3263 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3264 cannot only rely on it: check also interp->fscodec_initialized for
3265 subinterpreters. */
3266 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003267 return PyUnicode_Decode(s, size,
3268 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003269 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003270 }
3271 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003272 /* locale encoding with surrogateescape */
3273 wchar_t *wchar;
3274 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003275 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003276
3277 if (s[size] != '\0' || size != strlen(s)) {
3278 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3279 return NULL;
3280 }
3281
Victor Stinner168e1172010-10-16 23:16:16 +00003282 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003283 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003284 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003285
Victor Stinner168e1172010-10-16 23:16:16 +00003286 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003287 PyMem_Free(wchar);
3288 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003289 }
Victor Stinnerad158722010-10-27 00:25:46 +00003290#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003291}
3292
Martin v. Löwis011e8422009-05-05 04:43:17 +00003293
3294int
3295PyUnicode_FSConverter(PyObject* arg, void* addr)
3296{
3297 PyObject *output = NULL;
3298 Py_ssize_t size;
3299 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003300 if (arg == NULL) {
3301 Py_DECREF(*(PyObject**)addr);
3302 return 1;
3303 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003304 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003305 output = arg;
3306 Py_INCREF(output);
3307 }
3308 else {
3309 arg = PyUnicode_FromObject(arg);
3310 if (!arg)
3311 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003312 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003313 Py_DECREF(arg);
3314 if (!output)
3315 return 0;
3316 if (!PyBytes_Check(output)) {
3317 Py_DECREF(output);
3318 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3319 return 0;
3320 }
3321 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003322 size = PyBytes_GET_SIZE(output);
3323 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003324 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003325 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003326 Py_DECREF(output);
3327 return 0;
3328 }
3329 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003330 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003331}
3332
3333
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003334int
3335PyUnicode_FSDecoder(PyObject* arg, void* addr)
3336{
3337 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003338 if (arg == NULL) {
3339 Py_DECREF(*(PyObject**)addr);
3340 return 1;
3341 }
3342 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003343 if (PyUnicode_READY(arg))
3344 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003345 output = arg;
3346 Py_INCREF(output);
3347 }
3348 else {
3349 arg = PyBytes_FromObject(arg);
3350 if (!arg)
3351 return 0;
3352 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3353 PyBytes_GET_SIZE(arg));
3354 Py_DECREF(arg);
3355 if (!output)
3356 return 0;
3357 if (!PyUnicode_Check(output)) {
3358 Py_DECREF(output);
3359 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3360 return 0;
3361 }
3362 }
Victor Stinner065836e2011-10-27 01:56:33 +02003363 if (PyUnicode_READY(output) < 0) {
3364 Py_DECREF(output);
3365 return 0;
3366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003367 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003368 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003369 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3370 Py_DECREF(output);
3371 return 0;
3372 }
3373 *(PyObject**)addr = output;
3374 return Py_CLEANUP_SUPPORTED;
3375}
3376
3377
Martin v. Löwis5b222132007-06-10 09:51:05 +00003378char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003379PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003380{
Christian Heimesf3863112007-11-22 07:46:41 +00003381 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003382
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003383 if (!PyUnicode_Check(unicode)) {
3384 PyErr_BadArgument();
3385 return NULL;
3386 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003387 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003388 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003389
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003390 if (PyUnicode_UTF8(unicode) == NULL) {
3391 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003392 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3393 if (bytes == NULL)
3394 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003395 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3396 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003397 Py_DECREF(bytes);
3398 return NULL;
3399 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003400 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3401 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3402 PyBytes_AS_STRING(bytes),
3403 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003404 Py_DECREF(bytes);
3405 }
3406
3407 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003408 *psize = PyUnicode_UTF8_LENGTH(unicode);
3409 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003410}
3411
3412char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003413PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003414{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003415 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3416}
3417
#ifdef Py_DEBUG
/* Debug-build counter, incremented by PyUnicode_AsUnicodeAndSize() each
   time it has to build the wchar_t representation of a string. */
static int unicode_as_unicode_calls = 0;
#endif
3421
3422
/* Return the wchar_t (Py_UNICODE) representation of the str object
   unicode, building and storing it on the object the first time it is
   requested.

   unicode: the str object.
   size:    if not NULL, receives the length of the representation in
            wchar_t units.

   When wchar_t is 2 bytes wide, characters above 0xFFFF are written as
   surrogate pairs.  The 4-byte kind with a 4-byte wchar_t and the 2-byte
   kind with a 2-byte wchar_t are treated as impossible here (the object
   is expected to share its wstr with its data already) and abort via
   Py_FatalError().

   Return the NUL-terminated buffer, or NULL with an exception set. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
    /* Only the source pointers needed for a real conversion on this
       platform are declared. */
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    /* Build the representation only if the object does not have one yet. */
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

#ifdef Py_DEBUG
        ++unicode_as_unicode_calls;
#endif

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            /* First pass: count the characters that need two wchar_t
               units, so the buffer can be sized exactly. */
            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* One unit per character, one extra per surrogate pair, plus
               the terminating NUL. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass: rewind the source pointer and convert. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= 0x10FFFF);
                    /* encode surrogate pair in this case */
                    *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
                    *w   = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
                }
                else
                    *w = *four_bytes;

                /* Sanity check against the size computed in the first
                   pass; only effective when assertions are enabled. */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1-byte or 2-byte kind: one wchar_t unit per character plus
               the terminating NUL. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* NOTE(review): the length is not stored for compact ASCII
               objects, presumably because they have no separate
               wstr_length field -- confirm against the struct layout. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                /* Widen each byte to a wchar_t. */
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* Widen each 2-byte unit to a wchar_t. */
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Undo the allocation made above before aborting. */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
3538
Alexander Belopolsky40018472011-02-26 01:02:56 +00003539Py_UNICODE *
3540PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003541{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003542 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543}
3544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003545
Alexander Belopolsky40018472011-02-26 01:02:56 +00003546Py_ssize_t
3547PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548{
3549 if (!PyUnicode_Check(unicode)) {
3550 PyErr_BadArgument();
3551 goto onError;
3552 }
3553 return PyUnicode_GET_SIZE(unicode);
3554
Benjamin Peterson29060642009-01-31 22:14:21 +00003555 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 return -1;
3557}
3558
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003559Py_ssize_t
3560PyUnicode_GetLength(PyObject *unicode)
3561{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003562 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003563 PyErr_BadArgument();
3564 return -1;
3565 }
3566
3567 return PyUnicode_GET_LENGTH(unicode);
3568}
3569
3570Py_UCS4
3571PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3572{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003573 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3574 PyErr_BadArgument();
3575 return (Py_UCS4)-1;
3576 }
3577 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3578 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003579 return (Py_UCS4)-1;
3580 }
3581 return PyUnicode_READ_CHAR(unicode, index);
3582}
3583
3584int
3585PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3586{
3587 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003588 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003589 return -1;
3590 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003591 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3592 PyErr_SetString(PyExc_IndexError, "string index out of range");
3593 return -1;
3594 }
3595 if (_PyUnicode_Dirty(unicode))
3596 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003597 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3598 index, ch);
3599 return 0;
3600}
3601
/* Return the name of the default encoding, which is always "utf-8".
   The result is a string literal. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3607
Victor Stinner554f3f02010-06-16 23:33:54 +00003608/* create or adjust a UnicodeDecodeError */
3609static void
3610make_decode_exception(PyObject **exceptionObject,
3611 const char *encoding,
3612 const char *input, Py_ssize_t length,
3613 Py_ssize_t startpos, Py_ssize_t endpos,
3614 const char *reason)
3615{
3616 if (*exceptionObject == NULL) {
3617 *exceptionObject = PyUnicodeDecodeError_Create(
3618 encoding, input, length, startpos, endpos, reason);
3619 }
3620 else {
3621 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3622 goto onError;
3623 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3624 goto onError;
3625 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3626 goto onError;
3627 }
3628 return;
3629
3630onError:
3631 Py_DECREF(*exceptionObject);
3632 *exceptionObject = NULL;
3633}
3634
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635/* error handling callback helper:
3636 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003637 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003638 and adjust various state variables.
3639 return 0 on success, -1 on error
3640*/
3641
Alexander Belopolsky40018472011-02-26 01:02:56 +00003642static int
3643unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003644 const char *encoding, const char *reason,
3645 const char **input, const char **inend, Py_ssize_t *startinpos,
3646 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003647 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003648{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003649 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650
3651 PyObject *restuple = NULL;
3652 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003653 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003654 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003655 Py_ssize_t requiredsize;
3656 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003657 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658 int res = -1;
3659
Victor Stinner596a6c42011-11-09 00:02:18 +01003660 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3661 outsize = PyUnicode_GET_LENGTH(*output);
3662 else
3663 outsize = _PyUnicode_WSTR_LENGTH(*output);
3664
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003666 *errorHandler = PyCodec_LookupError(errors);
3667 if (*errorHandler == NULL)
3668 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003669 }
3670
Victor Stinner554f3f02010-06-16 23:33:54 +00003671 make_decode_exception(exceptionObject,
3672 encoding,
3673 *input, *inend - *input,
3674 *startinpos, *endinpos,
3675 reason);
3676 if (*exceptionObject == NULL)
3677 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678
3679 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3680 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003681 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003682 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003683 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003684 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 }
3686 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003687 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003688 if (PyUnicode_READY(repunicode) < 0)
3689 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003690
3691 /* Copy back the bytes variables, which might have been modified by the
3692 callback */
3693 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3694 if (!inputobj)
3695 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003696 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003697 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003698 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003699 *input = PyBytes_AS_STRING(inputobj);
3700 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003701 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003702 /* we can DECREF safely, as the exception has another reference,
3703 so the object won't go away. */
3704 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003705
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003706 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003707 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003708 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003709 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3710 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003711 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003712
Victor Stinner596a6c42011-11-09 00:02:18 +01003713 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3714 /* need more space? (at least enough for what we
3715 have+the replacement+the rest of the string (starting
3716 at the new input position), so we won't have to check space
3717 when there are no errors in the rest of the string) */
3718 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3719 requiredsize = *outpos + replen + insize-newpos;
3720 if (requiredsize > outsize) {
3721 if (requiredsize<2*outsize)
3722 requiredsize = 2*outsize;
3723 if (unicode_resize(output, requiredsize) < 0)
3724 goto onError;
3725 }
3726 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003727 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003728 copy_characters(*output, *outpos, repunicode, 0, replen);
3729 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003730 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003731 else {
3732 wchar_t *repwstr;
3733 Py_ssize_t repwlen;
3734 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3735 if (repwstr == NULL)
3736 goto onError;
3737 /* need more space? (at least enough for what we
3738 have+the replacement+the rest of the string (starting
3739 at the new input position), so we won't have to check space
3740 when there are no errors in the rest of the string) */
3741 requiredsize = *outpos + repwlen + insize-newpos;
3742 if (requiredsize > outsize) {
3743 if (requiredsize < 2*outsize)
3744 requiredsize = 2*outsize;
3745 if (unicode_resize(output, requiredsize) < 0)
3746 goto onError;
3747 }
3748 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3749 *outpos += repwlen;
3750 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003751 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003752 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003753
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003754 /* we made it! */
3755 res = 0;
3756
Benjamin Peterson29060642009-01-31 22:14:21 +00003757 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 Py_XDECREF(restuple);
3759 return res;
3760}
3761
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003762/* --- UTF-7 Codec -------------------------------------------------------- */
3763
Antoine Pitrou244651a2009-05-04 18:56:13 +00003764/* See RFC2152 for details. We encode conservatively and decode liberally. */
3765
3766/* Three simple macros defining base-64. */
3767
/* IS_BASE64(c): non-zero when c is one of the 64 base-64 alphabet
   characters (A-Z, a-z, 0-9, '+', '/').  The argument is evaluated
   several times, so it must be free of side effects. */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* FROM_BASE64(c): the 6-bit value (0..63) of base-64 character c.
   Only meaningful when IS_BASE64(c) holds; any character outside the
   letter/digit/'+' ranges maps to 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64(n): the base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT(c): non-zero when byte c found in UTF-7 input decodes
 * as itself.  Decoding is permissive: every ASCII byte is accepted
 * as-is except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
3796
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is never written
 * to, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be side-effect free (it is evaluated several times).  NUL and
 * anything >= 128 are never encoded directly.  directO and directWS are
 * parenthesized so that arbitrary expressions can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003842
Alexander Belopolsky40018472011-02-26 01:02:56 +00003843PyObject *
3844PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003845 Py_ssize_t size,
3846 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003847{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003848 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3849}
3850
Antoine Pitrou244651a2009-05-04 18:56:13 +00003851/* The decoder. The only state we preserve is our read position,
3852 * i.e. how many characters we have consumed. So if we end in the
3853 * middle of a shift sequence we have to back off the read position
3854 * and the output to the beginning of the sequence, otherwise we lose
3855 * all the shift state (seen bits, number of bits seen, high
3856 * surrogate). */
3857
Alexander Belopolsky40018472011-02-26 01:02:56 +00003858PyObject *
3859PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003860 Py_ssize_t size,
3861 const char *errors,
3862 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003863{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003864 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003865 Py_ssize_t startinpos;
3866 Py_ssize_t endinpos;
3867 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003868 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003869 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003870 const char *errmsg = "";
3871 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003872 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003873 unsigned int base64bits = 0;
3874 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003875 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003876 PyObject *errorHandler = NULL;
3877 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003878
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003879 /* Start off assuming it's all ASCII. Widen later as necessary. */
3880 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003881 if (!unicode)
3882 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003883 if (size == 0) {
3884 if (consumed)
3885 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003886 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003887 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003888
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003889 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003890 e = s + size;
3891
3892 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003893 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003894 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003895 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003896
Antoine Pitrou244651a2009-05-04 18:56:13 +00003897 if (inShift) { /* in a base-64 section */
3898 if (IS_BASE64(ch)) { /* consume a base-64 character */
3899 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3900 base64bits += 6;
3901 s++;
3902 if (base64bits >= 16) {
3903 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003904 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003905 base64bits -= 16;
3906 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3907 if (surrogate) {
3908 /* expecting a second surrogate */
3909 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003910 Py_UCS4 ch2 = (((surrogate & 0x3FF)<<10)
3911 | (outCh & 0x3FF)) + 0x10000;
3912 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3913 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003914 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003915 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003916 }
3917 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003918 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3919 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003920 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003921 }
3922 }
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003923 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003924 /* first surrogate */
3925 surrogate = outCh;
3926 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003927 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003928 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3929 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003930 }
3931 }
3932 }
3933 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003934 inShift = 0;
3935 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003936 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003937 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3938 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003939 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003940 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003941 if (base64bits > 0) { /* left-over bits */
3942 if (base64bits >= 6) {
3943 /* We've seen at least one base-64 character */
3944 errmsg = "partial character in shift sequence";
3945 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003946 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003947 else {
3948 /* Some bits remain; they should be zero */
3949 if (base64buffer != 0) {
3950 errmsg = "non-zero padding bits in shift sequence";
3951 goto utf7Error;
3952 }
3953 }
3954 }
3955 if (ch != '-') {
3956 /* '-' is absorbed; other terminating
3957 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003958 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3959 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003960 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003961 }
3962 }
3963 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003965 s++; /* consume '+' */
3966 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003967 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003968 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3969 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003970 }
3971 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003972 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003973 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003974 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003975 }
3976 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003977 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003978 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3979 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003980 s++;
3981 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003982 else {
3983 startinpos = s-starts;
3984 s++;
3985 errmsg = "unexpected special character";
3986 goto utf7Error;
3987 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003988 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003989utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003990 endinpos = s-starts;
3991 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003992 errors, &errorHandler,
3993 "utf7", errmsg,
3994 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003995 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003996 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003997 }
3998
Antoine Pitrou244651a2009-05-04 18:56:13 +00003999 /* end of string */
4000
4001 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4002 /* if we're in an inconsistent state, that's an error */
4003 if (surrogate ||
4004 (base64bits >= 6) ||
4005 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004006 endinpos = size;
4007 if (unicode_decode_call_errorhandler(
4008 errors, &errorHandler,
4009 "utf7", "unterminated shift sequence",
4010 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004011 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004012 goto onError;
4013 if (s < e)
4014 goto restart;
4015 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004016 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004017
4018 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004019 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004020 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004021 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004022 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004023 }
4024 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004025 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004026 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004027 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004028
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004029 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004030 goto onError;
4031
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004032 Py_XDECREF(errorHandler);
4033 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004034 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004035
Benjamin Peterson29060642009-01-31 22:14:21 +00004036 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004037 Py_XDECREF(errorHandler);
4038 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004039 Py_DECREF(unicode);
4040 return NULL;
4041}
4042
4043
Alexander Belopolsky40018472011-02-26 01:02:56 +00004044PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004045_PyUnicode_EncodeUTF7(PyObject *str,
4046 int base64SetO,
4047 int base64WhiteSpace,
4048 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004049{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004050 int kind;
4051 void *data;
4052 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004053 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004054 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004055 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004056 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004057 unsigned int base64bits = 0;
4058 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004059 char * out;
4060 char * start;
4061
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004062 if (PyUnicode_READY(str) < 0)
4063 return NULL;
4064 kind = PyUnicode_KIND(str);
4065 data = PyUnicode_DATA(str);
4066 len = PyUnicode_GET_LENGTH(str);
4067
4068 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004069 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004070
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004071 /* It might be possible to tighten this worst case */
4072 allocated = 8 * len;
4073 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004074 return PyErr_NoMemory();
4075
Antoine Pitrou244651a2009-05-04 18:56:13 +00004076 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004077 if (v == NULL)
4078 return NULL;
4079
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004080 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004081 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004082 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004083
Antoine Pitrou244651a2009-05-04 18:56:13 +00004084 if (inShift) {
4085 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4086 /* shifting out */
4087 if (base64bits) { /* output remaining bits */
4088 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4089 base64buffer = 0;
4090 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004091 }
4092 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004093 /* Characters not in the BASE64 set implicitly unshift the sequence
4094 so no '-' is required, except if the character is itself a '-' */
4095 if (IS_BASE64(ch) || ch == '-') {
4096 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004097 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004098 *out++ = (char) ch;
4099 }
4100 else {
4101 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004102 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004103 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004104 else { /* not in a shift sequence */
4105 if (ch == '+') {
4106 *out++ = '+';
4107 *out++ = '-';
4108 }
4109 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4110 *out++ = (char) ch;
4111 }
4112 else {
4113 *out++ = '+';
4114 inShift = 1;
4115 goto encode_char;
4116 }
4117 }
4118 continue;
4119encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004120 if (ch >= 0x10000) {
Victor Stinner0d3721d2011-11-22 03:27:53 +01004121 assert(ch <= 0x10FFFF);
4122
Antoine Pitrou244651a2009-05-04 18:56:13 +00004123 /* code first surrogate */
4124 base64bits += 16;
4125 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4126 while (base64bits >= 6) {
4127 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4128 base64bits -= 6;
4129 }
4130 /* prepare second surrogate */
4131 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4132 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004133 base64bits += 16;
4134 base64buffer = (base64buffer << 16) | ch;
4135 while (base64bits >= 6) {
4136 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4137 base64bits -= 6;
4138 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004139 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004140 if (base64bits)
4141 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4142 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004143 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004144 if (_PyBytes_Resize(&v, out - start) < 0)
4145 return NULL;
4146 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004147}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004148PyObject *
4149PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4150 Py_ssize_t size,
4151 int base64SetO,
4152 int base64WhiteSpace,
4153 const char *errors)
4154{
4155 PyObject *result;
4156 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4157 if (tmp == NULL)
4158 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004159 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004160 base64WhiteSpace, errors);
4161 Py_DECREF(tmp);
4162 return result;
4163}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004164
Antoine Pitrou244651a2009-05-04 18:56:13 +00004165#undef IS_BASE64
4166#undef FROM_BASE64
4167#undef TO_BASE64
4168#undef DECODE_DIRECT
4169#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004170
Guido van Rossumd57fd912000-03-10 22:53:23 +00004171/* --- UTF-8 Codec -------------------------------------------------------- */
4172
/* Map a UTF-8 encoded prefix (first) byte to the length of the sequence
   it starts.  Zero means illegal prefix: continuation bytes (80-BF),
   the overlong prefixes C0-C1 and everything above F4.  See RFC 3629
   for details.  The table is read-only, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4194
Alexander Belopolsky40018472011-02-26 01:02:56 +00004195PyObject *
4196PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004197 Py_ssize_t size,
4198 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199{
Walter Dörwald69652032004-09-07 20:24:22 +00004200 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4201}
4202
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004203#include "stringlib/ucs1lib.h"
4204#include "stringlib/codecs.h"
4205#include "stringlib/undef.h"
4206
4207#include "stringlib/ucs2lib.h"
4208#include "stringlib/codecs.h"
4209#include "stringlib/undef.h"
4210
4211#include "stringlib/ucs4lib.h"
4212#include "stringlib/codecs.h"
4213#include "stringlib/undef.h"
4214
Antoine Pitrouab868312009-01-10 15:40:25 +00004215/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4216#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4217
4218/* Mask to quickly check whether a C 'long' contains a
4219 non-ASCII, UTF8-encoded char. */
4220#if (SIZEOF_LONG == 8)
4221# define ASCII_CHAR_MASK 0x8080808080808080L
4222#elif (SIZEOF_LONG == 4)
4223# define ASCII_CHAR_MASK 0x80808080L
4224#else
4225# error C 'long' size should be either 4 or 8!
4226#endif
4227
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004228/* Scans a UTF-8 string and returns the maximum character to be expected
4229 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004230
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004231 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004232 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004233 */
4234static Py_UCS4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004235utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
4236 Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004237{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004239 const unsigned char *p = (const unsigned char *)s;
4240 const unsigned char *end = p + string_size;
4241 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004242
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004243 assert(unicode_size != NULL);
4244
4245 /* By having a cascade of independent loops which fallback onto each
4246 other, we minimize the amount of work done in the average loop
4247 iteration, and we also maximize the CPU's ability to predict
4248 branches correctly (because a given condition will have always the
4249 same boolean outcome except perhaps in the last iteration of the
4250 corresponding loop).
4251 In the general case this brings us rather close to decoding
4252 performance pre-PEP 393, despite the two-pass decoding.
4253
4254 Note that the pure ASCII loop is not duplicated once a non-ASCII
4255 character has been encountered. It is actually a pessimization (by
4256 a significant factor) to use this loop on text with many non-ASCII
4257 characters, and it is important to avoid bad performance on valid
4258 utf-8 data (invalid utf-8 being a different can of worms).
4259 */
4260
4261 /* ASCII */
4262 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004263 /* Only check value if it's not a ASCII char... */
4264 if (*p < 0x80) {
4265 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4266 an explanation. */
4267 if (!((size_t) p & LONG_PTR_MASK)) {
4268 /* Help register allocation */
4269 register const unsigned char *_p = p;
4270 while (_p < aligned_end) {
4271 unsigned long value = *(unsigned long *) _p;
4272 if (value & ASCII_CHAR_MASK)
4273 break;
4274 _p += SIZEOF_LONG;
4275 char_count += SIZEOF_LONG;
4276 }
4277 p = _p;
4278 if (p == end)
4279 break;
4280 }
4281 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004282 if (*p < 0x80)
4283 ++char_count;
4284 else
4285 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004286 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004287 *unicode_size = char_count;
4288 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004289
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004290_ucs1loop:
4291 for (; p < end; ++p) {
4292 if (*p < 0xc4)
4293 char_count += ((*p & 0xc0) != 0x80);
4294 else
4295 goto _ucs2loop;
4296 }
4297 *unicode_size = char_count;
4298 return 255;
4299
4300_ucs2loop:
4301 for (; p < end; ++p) {
4302 if (*p < 0xf0)
4303 char_count += ((*p & 0xc0) != 0x80);
4304 else
4305 goto _ucs4loop;
4306 }
4307 *unicode_size = char_count;
4308 return 65535;
4309
4310_ucs4loop:
4311 for (; p < end; ++p) {
4312 char_count += ((*p & 0xc0) != 0x80);
4313 }
4314 *unicode_size = char_count;
4315 return 65537;
4316}
4317
4318/* Called when we encountered some error that wasn't detected in the original
4319 scan, e.g. an encoded surrogate character. The original maxchar computation
4320 may have been incorrect, so redo it. */
4321static int
4322refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
4323{
4324 PyObject *tmp;
Victor Stinnerf8facac2011-11-22 02:30:47 +01004325 Py_ssize_t k;
4326 Py_UCS4 maxchar;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004327 for (k = 0, maxchar = 0; k < n; k++)
4328 maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
4329 tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
4330 if (tmp == NULL)
4331 return -1;
4332 PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
4333 Py_DECREF(*unicode);
4334 *unicode = tmp;
4335 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004336}
4337
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors. Implicit parameters: unicode, kind, data, has_errors,
   onError. Potential resizing overallocates, so the result needs to shrink
   at the end.

   When has_errors is set, the string is first grown to pos + pos/8
   characters if index lies past its current length, and the character is
   then stored with unicode_putchar(); a failure of either step jumps to
   onError.  When has_errors is clear, the character is stored directly with
   PyUnicode_WRITE using the caller's kind and data.

   index and value each appear once in whichever branch runs, and both are
   parenthesized so that arbitrary expressions can be passed.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        if (has_errors) {                                               \
            Py_ssize_t pos = (index);                                   \
            if (pos > PyUnicode_GET_LENGTH(unicode) &&                  \
                unicode_resize(&unicode, pos + pos/8) < 0)              \
                goto onError;                                           \
            if (unicode_putchar(&unicode, &pos, (value)) < 0)           \
                goto onError;                                           \
        }                                                               \
        else                                                            \
            PyUnicode_WRITE(kind, data, (index), (value));              \
    } while (0)
4356
/* Decode size bytes of UTF-8 starting at s into a new string object.

   errors is forwarded to unicode_decode_call_errorhandler() whenever an
   invalid sequence is found.  If consumed is non-NULL, a sequence that is
   cut off by the end of the input stops decoding without raising an error
   and *consumed receives the number of input bytes that were decoded; if
   consumed is NULL such a sequence is reported as "unexpected end of data".

   Returns the decoded string, or NULL on failure. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00004357PyObject *
4358PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004359 Py_ssize_t size,
4360 const char *errors,
4361 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004362{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004363 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004365 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004366 Py_ssize_t startinpos;
4367 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004368 const char *e, *aligned_end;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004369 PyObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004370 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004371 PyObject *errorHandler = NULL;
4372 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004373 Py_UCS4 maxchar = 0;
4374 Py_ssize_t unicode_size;
4375 Py_ssize_t i;
4376 int kind;
4377 void *data;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004378 int has_errors = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379
Walter Dörwald69652032004-09-07 20:24:22 +00004380 if (size == 0) {
4381 if (consumed)
4382 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004383 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004384 }
/* First pass over the input: estimate the widest character and the number
   of characters so the result can be allocated up front. */
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004385 maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004386 /* When the string is ASCII only, just use memcpy and return.
4387 unicode_size may be != size if there is an incomplete UTF-8
4388 sequence at the end of the ASCII block. */
4389 if (maxchar < 128 && size == unicode_size) {
Victor Stinner42885202011-11-22 01:23:02 +01004390 if (consumed)
4391 *consumed = size;
4392
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004393 if (size == 1)
4394 return get_latin1_char((unsigned char)s[0]);
4395
4396 unicode = PyUnicode_New(unicode_size, maxchar);
4397 if (!unicode)
4398 return NULL;
4399 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4400 assert(_PyUnicode_CheckConsistency(unicode, 1));
4401 return unicode;
4402 }
4403
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004404 /* In case of errors, maxchar and size computation might be incorrect;
4405 code below refits and resizes as necessary. */
4406 unicode = PyUnicode_New(unicode_size, maxchar);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004407 if (!unicode)
4408 return NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004409 kind = PyUnicode_KIND(unicode);
4410 data = PyUnicode_DATA(unicode);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004411
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004413 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 e = s + size;
/* Try the decoder specialised for the result's character width first; it
   advances s and i and reports through has_errors whether it had to stop. */
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004415 switch (kind) {
4416 case PyUnicode_1BYTE_KIND:
4417 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4418 break;
4419 case PyUnicode_2BYTE_KIND:
4420 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4421 break;
4422 case PyUnicode_4BYTE_KIND:
4423 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4424 break;
4425 }
4426 if (!has_errors) {
4427 /* Ensure the unicode size calculation was correct */
4428 assert(i == unicode_size);
4429 assert(s == e);
4430 if (consumed)
4431 *consumed = s-starts;
4432 return unicode;
4433 }
4434 /* Fall through to the generic decoding loop for the rest of
4435 the string */
/* has_errors is non-zero from here on (the other case returned above), so
   every WRITE_MAYBE_FAIL below takes its resizing branch and the stale
   kind/data values are not used for writing. */
4436 if (refit_partial_string(&unicode, kind, data, i) < 0)
4437 goto onError;
4438
Antoine Pitrouab868312009-01-10 15:40:25 +00004439 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440
4441 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004442 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443
4444 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004445 /* Fast path for runs of ASCII characters. Given that common UTF-8
4446 input will consist of an overwhelming majority of ASCII
4447 characters, we try to optimize for this case by checking
4448 as many characters as a C 'long' can contain.
4449 First, check if we can do an aligned read, as most CPUs have
4450 a penalty for unaligned reads.
4451 */
4452 if (!((size_t) s & LONG_PTR_MASK)) {
4453 /* Help register allocation */
4454 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004455 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004456 while (_s < aligned_end) {
4457 /* Read a whole long at a time (either 4 or 8 bytes),
4458 and do a fast unrolled copy if it only contains ASCII
4459 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004460 unsigned long value = *(unsigned long *) _s;
4461 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004462 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004463 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4464 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4465 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4466 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004467#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004468 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4469 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4470 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4471 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004472#endif
4473 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004474 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004475 }
4476 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004477 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004478 if (s == e)
4479 break;
4480 ch = (unsigned char)*s;
4481 }
4482 }
4483
4484 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004485 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004486 s++;
4487 continue;
4488 }
4489
/* n is the length in bytes of the sequence introduced by lead byte ch,
   looked up in utf8_code_length; the switch below treats 0 as an invalid
   start byte. */
4490 n = utf8_code_length[ch];
4491
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004492 if (s + n > e) {
/* The sequence is cut off by the end of the input: in stateful mode
   (consumed != NULL) stop here and leave the tail undecoded, otherwise
   report an error covering the lead byte and any continuation bytes. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 if (consumed)
4494 break;
4495 else {
4496 errmsg = "unexpected end of data";
4497 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004498 endinpos = startinpos+1;
4499 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4500 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 goto utf8Error;
4502 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004503 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504
4505 switch (n) {
4506
4507 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004508 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 startinpos = s-starts;
4510 endinpos = startinpos+1;
4511 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512
4513 case 1:
/* Bytes below 0x80 were already handled above, so a length of 1 is not
   expected here. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004514 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 startinpos = s-starts;
4516 endinpos = startinpos+1;
4517 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518
4519 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004520 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004521 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004523 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004524 goto utf8Error;
4525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004527 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004528 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004529 break;
4530
4531 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004532 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4533 will result in surrogates in range d800-dfff. Surrogates are
4534 not valid UTF-8 so they are rejected.
4535 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4536 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004537 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004538 (s[2] & 0xc0) != 0x80 ||
4539 ((unsigned char)s[0] == 0xE0 &&
4540 (unsigned char)s[1] < 0xA0) ||
4541 ((unsigned char)s[0] == 0xED &&
4542 (unsigned char)s[1] > 0x9F)) {
4543 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004544 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004545 endinpos = startinpos + 1;
4546
4547 /* if s[1] first two bits are 1 and 0, then the invalid
4548 continuation byte is s[2], so increment endinpos by 1,
4549 if not, s[1] is invalid and endinpos doesn't need to
4550 be incremented. */
4551 if ((s[1] & 0xC0) == 0x80)
4552 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 goto utf8Error;
4554 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004556 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004557 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004558 break;
4559
4560 case 4:
/* Besides the continuation-byte pattern, reject F0 followed by a byte
   below 0x90 and F4 followed by a byte above 0x8F, keeping the decoded
   value within the range asserted below. */
4561 if ((s[1] & 0xc0) != 0x80 ||
4562 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004563 (s[3] & 0xc0) != 0x80 ||
4564 ((unsigned char)s[0] == 0xF0 &&
4565 (unsigned char)s[1] < 0x90) ||
4566 ((unsigned char)s[0] == 0xF4 &&
4567 (unsigned char)s[1] > 0x8F)) {
4568 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004569 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004570 endinpos = startinpos + 1;
4571 if ((s[1] & 0xC0) == 0x80) {
4572 endinpos++;
4573 if ((s[2] & 0xC0) == 0x80)
4574 endinpos++;
4575 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004576 goto utf8Error;
4577 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004578 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004579 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4580 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4581
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004582 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004583 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584 }
4585 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004586 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004587
Benjamin Peterson29060642009-01-31 22:14:21 +00004588 utf8Error:
/* NOTE(review): has_errors looks to be always non-zero when this label is
   reached, because the !has_errors case returned before the loop; this
   refit therefore appears unreachable -- confirm before relying on it. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004589 if (!has_errors) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004590 if (refit_partial_string(&unicode, kind, data, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004591 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004592 has_errors = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004593 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004594 if (unicode_decode_call_errorhandler(
4595 errors, &errorHandler,
4596 "utf8", errmsg,
4597 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004598 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004599 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004600 /* Update data because unicode_decode_call_errorhandler might have
4601 re-created or resized the unicode object. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004602 data = PyUnicode_DATA(unicode);
4603 kind = PyUnicode_KIND(unicode);
/* e is passed by address to the handler above, so the aligned end is
   recomputed from its current value. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004604 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004606 /* Ensure the unicode_size calculation above was correct: */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004607 assert(has_errors || i == unicode_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004608
Walter Dörwald69652032004-09-07 20:24:22 +00004609 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004612 /* Adjust length and ready string when it contained errors and
4613 is of the old resizable kind. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004614 if (has_errors) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004615 if (PyUnicode_Resize(&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004616 goto onError;
4617 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004619 Py_XDECREF(errorHandler);
4620 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004621 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004622 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623
/* Common failure exit: release the handler, the exception object and the
   partially built result. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004624 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004625 Py_XDECREF(errorHandler);
4626 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627 Py_DECREF(unicode);
4628 return NULL;
4629}
4630
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004631#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004632
#ifdef __APPLE__

/* Minimal UTF-8 decoder with the surrogateescape policy built in: every
   byte that cannot be decoded is emitted as 0xDC00 + byte and decoding
   resumes at the following byte.  Used to decode the command line
   arguments on Mac OS X.

   Returns a NUL-terminated wchar_t buffer obtained from PyMem_Malloc(), or
   NULL when the buffer size would overflow or the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *buffer, *out;

    /* One output unit per input byte, plus the terminator, is always
       sufficient: the decoded text never has more units than the input
       has bytes. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    out = buffer;
    end = s + size;
    while (s < end) {
        const Py_UCS4 lead = (unsigned char)*s;
        Py_UCS4 ch;
        int seqlen;
        int escape;

        if (lead < 0x80) {
            /* ASCII is copied through unchanged */
            *out++ = (wchar_t)lead;
            s++;
            continue;
        }

        seqlen = utf8_code_length[lead];

        /* A sequence that would run past the end of the input is escaped
           without looking at its remaining bytes. */
        escape = 1;
        if (s + seqlen <= end) {
            escape = 0;
            switch (seqlen) {
            case 0:
            case 1:
                /* not a valid lead byte */
                escape = 1;
                break;

            case 2:
                if ((s[1] & 0xc0) == 0x80) {
                    ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
                    assert ((ch > 0x007F) && (ch <= 0x07FF));
                    *out++ = (wchar_t)ch;
                }
                else
                    escape = 1;
                break;

            case 3:
                /* Sequences \xed\xa0\x80-\xed\xbf\xbf would decode to the
                   surrogates d800-dfff, which are not valid UTF-8, so they
                   are rejected along with E0 followed by a byte below A0.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                if ((s[1] & 0xc0) == 0x80 &&
                    (s[2] & 0xc0) == 0x80 &&
                    ((unsigned char)s[0] != 0xE0 ||
                     (unsigned char)s[1] >= 0xA0) &&
                    ((unsigned char)s[0] != 0xED ||
                     (unsigned char)s[1] <= 0x9F)) {
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) +
                         (s[2] & 0x3f);
                    assert ((ch > 0x07FF) && (ch <= 0xFFFF));
                    *out++ = (wchar_t)ch;
                }
                else
                    escape = 1;
                break;

            case 4:
                if ((s[1] & 0xc0) == 0x80 &&
                    (s[2] & 0xc0) == 0x80 &&
                    (s[3] & 0xc0) == 0x80 &&
                    ((unsigned char)s[0] != 0xF0 ||
                     (unsigned char)s[1] >= 0x90) &&
                    ((unsigned char)s[0] != 0xF4 ||
                     (unsigned char)s[1] <= 0x8F)) {
                    ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                         ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
                    assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
                    *out++ = (wchar_t)ch;
#else
                    /* Split into a surrogate pair: map 10000..10FFFF onto
                       0..FFFFF, then add the top 10 bits to D800 and the
                       bottom 10 bits to DC00. */
                    ch -= 0x10000;
                    *out++ = (wchar_t)(0xD800 + (ch >> 10));
                    *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
                }
                else
                    escape = 1;
                break;
            }
        }

        if (escape) {
            /* Emit the offending lead byte as a lone surrogate and retry
               from the next byte. */
            *out++ = 0xDC00 + lead;
            s++;
        }
        else
            s += seqlen;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004748/* Primary internal function which creates utf8 encoded bytes objects.
4749
4750 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004751 and allocate exactly as much space needed at the end. Else allocate the
4752 maximum possible needed (4 result bytes per Unicode character), and return
4753 the excess memory at the end.

   unicode must be a string object; otherwise PyErr_BadArgument() is called
   and NULL is returned.  errors is passed to
   unicode_encode_call_errorhandler() when a surrogate (U+D800..U+DFFF) is
   encountered.  If the string already carries a UTF-8 representation
   (PyUnicode_UTF8), a bytes object built from it is returned directly.

   Returns a new bytes object, or NULL on error.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004754*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004755PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004756_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757{
Tim Peters602f7402002-04-27 18:03:26 +00004758#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004759
Guido van Rossum98297ee2007-11-06 21:34:58 +00004760 Py_ssize_t i; /* index into data of the next character to encode */
4761 PyObject *result; /* result string object */
4762 char *p; /* next free byte in output buffer */
4763 Py_ssize_t nallocated; /* number of result bytes allocated */
4764 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004765 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004766 PyObject *errorHandler = NULL;
4767 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004768 int kind;
4769 void *data;
4770 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004771 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004773 if (!PyUnicode_Check(unicode)) {
4774 PyErr_BadArgument();
4775 return NULL;
4776 }
4777
4778 if (PyUnicode_READY(unicode) == -1)
4779 return NULL;
4780
/* Shortcut: reuse the UTF-8 representation already stored on the string. */
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004781 if (PyUnicode_UTF8(unicode))
4782 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4783 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004784
4785 kind = PyUnicode_KIND(unicode);
4786 data = PyUnicode_DATA(unicode);
4787 size = PyUnicode_GET_LENGTH(unicode);
4788
Tim Peters602f7402002-04-27 18:03:26 +00004789 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790
Tim Peters602f7402002-04-27 18:03:26 +00004791 if (size <= MAX_SHORT_UNICHARS) {
4792 /* Write into the stack buffer; nallocated can't overflow.
4793 * At the end, we'll allocate exactly as much heap space as it
4794 * turns out we need.
4795 */
4796 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004797 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004798 p = stackbuf;
4799 }
4800 else {
4801 /* Overallocate on the heap, and give the excess back at the end. */
4802 nallocated = size * 4;
4803 if (nallocated / 4 != size) /* overflow! */
4804 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004805 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004806 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004807 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004808 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004809 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004810
/* From here on, result == NULL means the output is still being written
   into stackbuf. */
Tim Peters602f7402002-04-27 18:03:26 +00004811 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004812 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004813
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004814 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004815 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004817
Guido van Rossumd57fd912000-03-10 22:53:23 +00004818 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004819 /* Encode U+0080..U+07FF as a two-byte sequence */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004820 *p++ = (char)(0xc0 | (ch >> 6));
4821 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004822 } else if (0xD800 <= ch && ch <= 0xDFFF) {
/* Surrogates cannot be encoded: ask the error handler for a replacement
   (bytes or str) covering just this one character. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004823 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004824 Py_ssize_t repsize, k, startpos;
4825 startpos = i-1;
/* NOTE(review): newpos is not read after this call; encoding resumes at i
   (startpos + 1) whatever position the handler reports -- confirm that
   this is intended. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004826 rep = unicode_encode_call_errorhandler(
4827 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004828 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004829 if (!rep)
4830 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004832 if (PyBytes_Check(rep))
4833 repsize = PyBytes_GET_SIZE(rep);
4834 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004835 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004836
/* Four output bytes are already reserved for every input character, so
   the buffer only has to grow by the part of the replacement beyond 4. */
4837 if (repsize > 4) {
4838 Py_ssize_t offset;
4839
4840 if (result == NULL)
4841 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004842 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004843 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004844
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004845 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4846 /* integer overflow */
4847 PyErr_NoMemory();
4848 goto error;
4849 }
4850 nallocated += repsize - 4;
4851 if (result != NULL) {
4852 if (_PyBytes_Resize(&result, nallocated) < 0)
4853 goto error;
4854 } else {
/* The stack buffer is no longer large enough: move what has been written
   so far into a heap bytes object and continue there. */
4855 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004856 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004857 goto error;
4858 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4859 }
/* The buffer may have moved; re-derive the write pointer from the offset. */
4860 p = PyBytes_AS_STRING(result) + offset;
4861 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004863 if (PyBytes_Check(rep)) {
4864 char *prep = PyBytes_AS_STRING(rep);
4865 for(k = repsize; k > 0; k--)
4866 *p++ = *prep++;
4867 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004868 enum PyUnicode_Kind repkind;
4869 void *repdata;
4870
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004871 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004872 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004873 repkind = PyUnicode_KIND(rep);
4874 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004875
/* A str replacement is copied one byte per character, so it must be pure
   ASCII; anything else is reported as an encoding error. */
4876 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004877 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004878 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004879 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004880 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004881 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004882 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004883 goto error;
4884 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004885 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004886 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004887 }
/* Py_CLEAR also resets rep to NULL, so the error path below cannot release
   it a second time. */
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004888 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004889 } else if (ch < 0x10000) {
/* Remaining characters below U+10000: three-byte sequence */
4890 *p++ = (char)(0xe0 | (ch >> 12));
4891 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4892 *p++ = (char)(0x80 | (ch & 0x3f));
4893 } else /* ch >= 0x10000 */ {
Victor Stinner0d3721d2011-11-22 03:27:53 +01004894 assert(ch <= 0x10FFFF);
Tim Peters602f7402002-04-27 18:03:26 +00004895 /* Encode UCS4 Unicode ordinals */
4896 *p++ = (char)(0xf0 | (ch >> 18));
4897 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4898 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4899 *p++ = (char)(0x80 | (ch & 0x3f));
4900 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004902
Guido van Rossum98297ee2007-11-06 21:34:58 +00004903 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004904 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004905 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004906 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004907 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004908 }
4909 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004910 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004911 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004912 assert(nneeded <= nallocated);
/* NOTE(review): the return value is not checked here; presumably
   _PyBytes_Resize leaves result NULL on failure so that the return below
   propagates the error -- confirm. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004913 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004914 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004915
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004916 Py_XDECREF(errorHandler);
4917 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004918 return result;
/* Failure exit: drop every reference that may still be held. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004919 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004920 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004921 Py_XDECREF(errorHandler);
4922 Py_XDECREF(exc);
4923 Py_XDECREF(result);
4924 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004925
Tim Peters602f7402002-04-27 18:03:26 +00004926#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927}
4928
Alexander Belopolsky40018472011-02-26 01:02:56 +00004929PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004930PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4931 Py_ssize_t size,
4932 const char *errors)
4933{
4934 PyObject *v, *unicode;
4935
4936 unicode = PyUnicode_FromUnicode(s, size);
4937 if (unicode == NULL)
4938 return NULL;
4939 v = _PyUnicode_AsUTF8String(unicode, errors);
4940 Py_DECREF(unicode);
4941 return v;
4942}
4943
4944PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004945PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004947 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948}
4949
Walter Dörwald41980ca2007-08-16 21:55:45 +00004950/* --- UTF-32 Codec ------------------------------------------------------- */
4951
4952PyObject *
4953PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 Py_ssize_t size,
4955 const char *errors,
4956 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004957{
4958 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4959}
4960
4961PyObject *
4962PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 Py_ssize_t size,
4964 const char *errors,
4965 int *byteorder,
4966 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004967{
4968 const char *starts = s;
4969 Py_ssize_t startinpos;
4970 Py_ssize_t endinpos;
4971 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004972 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004973 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004974 int bo = 0; /* assume native ordering by default */
4975 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004976 /* Offsets from q for retrieving bytes in the right order. */
4977#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4978 int iorder[] = {0, 1, 2, 3};
4979#else
4980 int iorder[] = {3, 2, 1, 0};
4981#endif
4982 PyObject *errorHandler = NULL;
4983 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004984
Walter Dörwald41980ca2007-08-16 21:55:45 +00004985 q = (unsigned char *)s;
4986 e = q + size;
4987
4988 if (byteorder)
4989 bo = *byteorder;
4990
4991 /* Check for BOM marks (U+FEFF) in the input and adjust current
4992 byte order setting accordingly. In native mode, the leading BOM
4993 mark is skipped, in all other modes, it is copied to the output
4994 stream as-is (giving a ZWNBSP character). */
4995 if (bo == 0) {
4996 if (size >= 4) {
4997 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004999#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005000 if (bom == 0x0000FEFF) {
5001 q += 4;
5002 bo = -1;
5003 }
5004 else if (bom == 0xFFFE0000) {
5005 q += 4;
5006 bo = 1;
5007 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005008#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 if (bom == 0x0000FEFF) {
5010 q += 4;
5011 bo = 1;
5012 }
5013 else if (bom == 0xFFFE0000) {
5014 q += 4;
5015 bo = -1;
5016 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005017#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005018 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005019 }
5020
5021 if (bo == -1) {
5022 /* force LE */
5023 iorder[0] = 0;
5024 iorder[1] = 1;
5025 iorder[2] = 2;
5026 iorder[3] = 3;
5027 }
5028 else if (bo == 1) {
5029 /* force BE */
5030 iorder[0] = 3;
5031 iorder[1] = 2;
5032 iorder[2] = 1;
5033 iorder[3] = 0;
5034 }
5035
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005036 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005037 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005038 if (!unicode)
5039 return NULL;
5040 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005041 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005042 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005043
Walter Dörwald41980ca2007-08-16 21:55:45 +00005044 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005045 Py_UCS4 ch;
5046 /* remaining bytes at the end? (size should be divisible by 4) */
5047 if (e-q<4) {
5048 if (consumed)
5049 break;
5050 errmsg = "truncated data";
5051 startinpos = ((const char *)q)-starts;
5052 endinpos = ((const char *)e)-starts;
5053 goto utf32Error;
5054 /* The remaining input chars are ignored if the callback
5055 chooses to skip the input */
5056 }
5057 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5058 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005059
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 if (ch >= 0x110000)
5061 {
5062 errmsg = "codepoint not in range(0x110000)";
5063 startinpos = ((const char *)q)-starts;
5064 endinpos = startinpos+4;
5065 goto utf32Error;
5066 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005067 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5068 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005069 q += 4;
5070 continue;
5071 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 if (unicode_decode_call_errorhandler(
5073 errors, &errorHandler,
5074 "utf32", errmsg,
5075 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005076 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005077 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005078 }
5079
5080 if (byteorder)
5081 *byteorder = bo;
5082
5083 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005085
5086 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005087 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005088 goto onError;
5089
5090 Py_XDECREF(errorHandler);
5091 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005092 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005093
Benjamin Peterson29060642009-01-31 22:14:21 +00005094 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005095 Py_DECREF(unicode);
5096 Py_XDECREF(errorHandler);
5097 Py_XDECREF(exc);
5098 return NULL;
5099}
5100
5101PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005102_PyUnicode_EncodeUTF32(PyObject *str,
5103 const char *errors,
5104 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005106 int kind;
5107 void *data;
5108 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005109 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005110 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005111 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005112 /* Offsets from p for storing byte pairs in the right order. */
5113#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5114 int iorder[] = {0, 1, 2, 3};
5115#else
5116 int iorder[] = {3, 2, 1, 0};
5117#endif
5118
Benjamin Peterson29060642009-01-31 22:14:21 +00005119#define STORECHAR(CH) \
5120 do { \
5121 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5122 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5123 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5124 p[iorder[0]] = (CH) & 0xff; \
5125 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005126 } while(0)
5127
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005128 if (!PyUnicode_Check(str)) {
5129 PyErr_BadArgument();
5130 return NULL;
5131 }
5132 if (PyUnicode_READY(str) < 0)
5133 return NULL;
5134 kind = PyUnicode_KIND(str);
5135 data = PyUnicode_DATA(str);
5136 len = PyUnicode_GET_LENGTH(str);
5137
5138 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005139 bytesize = nsize * 4;
5140 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005141 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005142 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005143 if (v == NULL)
5144 return NULL;
5145
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005146 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005147 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005148 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005149 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005150 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005151
5152 if (byteorder == -1) {
5153 /* force LE */
5154 iorder[0] = 0;
5155 iorder[1] = 1;
5156 iorder[2] = 2;
5157 iorder[3] = 3;
5158 }
5159 else if (byteorder == 1) {
5160 /* force BE */
5161 iorder[0] = 3;
5162 iorder[1] = 2;
5163 iorder[2] = 1;
5164 iorder[3] = 0;
5165 }
5166
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005167 for (i = 0; i < len; i++)
5168 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005169
5170 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005171 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005172#undef STORECHAR
5173}
5174
Alexander Belopolsky40018472011-02-26 01:02:56 +00005175PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005176PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5177 Py_ssize_t size,
5178 const char *errors,
5179 int byteorder)
5180{
5181 PyObject *result;
5182 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5183 if (tmp == NULL)
5184 return NULL;
5185 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5186 Py_DECREF(tmp);
5187 return result;
5188}
5189
5190PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005191PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005192{
Victor Stinnerb960b342011-11-20 19:12:52 +01005193 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005194}
5195
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196/* --- UTF-16 Codec ------------------------------------------------------- */
5197
Tim Peters772747b2001-08-09 22:21:55 +00005198PyObject *
5199PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005200 Py_ssize_t size,
5201 const char *errors,
5202 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203{
Walter Dörwald69652032004-09-07 20:24:22 +00005204 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5205}
5206
Antoine Pitrouab868312009-01-10 15:40:25 +00005207/* Two masks for fast checking of whether a C 'long' may contain
5208 UTF16-encoded surrogate characters. This is an efficient heuristic,
5209 assuming that non-surrogate characters with a code point >= 0x8000 are
5210 rare in most input.
5211 FAST_CHAR_MASK is used when the input is in native byte ordering,
5212 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005213*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005214#if (SIZEOF_LONG == 8)
5215# define FAST_CHAR_MASK 0x8000800080008000L
5216# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5217#elif (SIZEOF_LONG == 4)
5218# define FAST_CHAR_MASK 0x80008000L
5219# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5220#else
5221# error C 'long' size should be either 4 or 8!
5222#endif
5223
Walter Dörwald69652032004-09-07 20:24:22 +00005224PyObject *
5225PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005226 Py_ssize_t size,
5227 const char *errors,
5228 int *byteorder,
5229 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005230{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005231 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005232 Py_ssize_t startinpos;
5233 Py_ssize_t endinpos;
5234 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005235 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005236 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005237 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005238 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005239 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005240 /* Offsets from q for retrieving byte pairs in the right order. */
5241#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5242 int ihi = 1, ilo = 0;
5243#else
5244 int ihi = 0, ilo = 1;
5245#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005246 PyObject *errorHandler = NULL;
5247 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248
5249 /* Note: size will always be longer than the resulting Unicode
5250 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005251 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252 if (!unicode)
5253 return NULL;
5254 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005255 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005256 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257
Tim Peters772747b2001-08-09 22:21:55 +00005258 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005259 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260
5261 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005262 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005264 /* Check for BOM marks (U+FEFF) in the input and adjust current
5265 byte order setting accordingly. In native mode, the leading BOM
5266 mark is skipped, in all other modes, it is copied to the output
5267 stream as-is (giving a ZWNBSP character). */
5268 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005269 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005270 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005271#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005272 if (bom == 0xFEFF) {
5273 q += 2;
5274 bo = -1;
5275 }
5276 else if (bom == 0xFFFE) {
5277 q += 2;
5278 bo = 1;
5279 }
Tim Petersced69f82003-09-16 20:30:58 +00005280#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005281 if (bom == 0xFEFF) {
5282 q += 2;
5283 bo = 1;
5284 }
5285 else if (bom == 0xFFFE) {
5286 q += 2;
5287 bo = -1;
5288 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005289#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005290 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005291 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292
Tim Peters772747b2001-08-09 22:21:55 +00005293 if (bo == -1) {
5294 /* force LE */
5295 ihi = 1;
5296 ilo = 0;
5297 }
5298 else if (bo == 1) {
5299 /* force BE */
5300 ihi = 0;
5301 ilo = 1;
5302 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005303#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5304 native_ordering = ilo < ihi;
5305#else
5306 native_ordering = ilo > ihi;
5307#endif
Tim Peters772747b2001-08-09 22:21:55 +00005308
Antoine Pitrouab868312009-01-10 15:40:25 +00005309 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005310 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005311 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005312 /* First check for possible aligned read of a C 'long'. Unaligned
5313 reads are more expensive, better to defer to another iteration. */
5314 if (!((size_t) q & LONG_PTR_MASK)) {
5315 /* Fast path for runs of non-surrogate chars. */
5316 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005317 int kind = PyUnicode_KIND(unicode);
5318 void *data = PyUnicode_DATA(unicode);
5319 while (_q < aligned_end) {
5320 unsigned long block = * (unsigned long *) _q;
5321 unsigned short *pblock = (unsigned short*)&block;
5322 Py_UCS4 maxch;
5323 if (native_ordering) {
5324 /* Can use buffer directly */
5325 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005326 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005327 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005328 else {
5329 /* Need to byte-swap */
5330 unsigned char *_p = (unsigned char*)pblock;
5331 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005332 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005333 _p[0] = _q[1];
5334 _p[1] = _q[0];
5335 _p[2] = _q[3];
5336 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005337#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005338 _p[4] = _q[5];
5339 _p[5] = _q[4];
5340 _p[6] = _q[7];
5341 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005342#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005343 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005344 maxch = Py_MAX(pblock[0], pblock[1]);
5345#if SIZEOF_LONG == 8
5346 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5347#endif
5348 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5349 if (unicode_widen(&unicode, maxch) < 0)
5350 goto onError;
5351 kind = PyUnicode_KIND(unicode);
5352 data = PyUnicode_DATA(unicode);
5353 }
5354 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5355 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5356#if SIZEOF_LONG == 8
5357 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5358 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5359#endif
5360 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005361 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005362 q = _q;
5363 if (q >= e)
5364 break;
5365 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005366 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005367
Benjamin Peterson14339b62009-01-31 16:36:08 +00005368 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005369
5370 if (ch < 0xD800 || ch > 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005371 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5372 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 continue;
5374 }
5375
5376 /* UTF-16 code pair: */
5377 if (q > e) {
5378 errmsg = "unexpected end of data";
5379 startinpos = (((const char *)q) - 2) - starts;
5380 endinpos = ((const char *)e) + 1 - starts;
5381 goto utf16Error;
5382 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005383 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5384 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005385 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005386 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005387 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005388 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005389 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005390 continue;
5391 }
5392 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005393 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005394 startinpos = (((const char *)q)-4)-starts;
5395 endinpos = startinpos+2;
5396 goto utf16Error;
5397 }
5398
Benjamin Peterson14339b62009-01-31 16:36:08 +00005399 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005400 errmsg = "illegal encoding";
5401 startinpos = (((const char *)q)-2)-starts;
5402 endinpos = startinpos+2;
5403 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005404
Benjamin Peterson29060642009-01-31 22:14:21 +00005405 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005406 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005407 errors,
5408 &errorHandler,
5409 "utf16", errmsg,
5410 &starts,
5411 (const char **)&e,
5412 &startinpos,
5413 &endinpos,
5414 &exc,
5415 (const char **)&q,
5416 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005417 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005420 /* remaining byte at the end? (size should be even) */
5421 if (e == q) {
5422 if (!consumed) {
5423 errmsg = "truncated data";
5424 startinpos = ((const char *)q) - starts;
5425 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005426 if (unicode_decode_call_errorhandler(
5427 errors,
5428 &errorHandler,
5429 "utf16", errmsg,
5430 &starts,
5431 (const char **)&e,
5432 &startinpos,
5433 &endinpos,
5434 &exc,
5435 (const char **)&q,
5436 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005437 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005438 goto onError;
5439 /* The remaining input chars are ignored if the callback
5440 chooses to skip the input */
5441 }
5442 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443
5444 if (byteorder)
5445 *byteorder = bo;
5446
Walter Dörwald69652032004-09-07 20:24:22 +00005447 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005449
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005451 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 goto onError;
5453
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005454 Py_XDECREF(errorHandler);
5455 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005456 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457
Benjamin Peterson29060642009-01-31 22:14:21 +00005458 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005460 Py_XDECREF(errorHandler);
5461 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 return NULL;
5463}
5464
Antoine Pitrouab868312009-01-10 15:40:25 +00005465#undef FAST_CHAR_MASK
5466#undef SWAPPED_FAST_CHAR_MASK
5467
Tim Peters772747b2001-08-09 22:21:55 +00005468PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005469_PyUnicode_EncodeUTF16(PyObject *str,
5470 const char *errors,
5471 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005473 int kind;
5474 void *data;
5475 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005476 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005477 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005478 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005479 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005480 /* Offsets from p for storing byte pairs in the right order. */
5481#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5482 int ihi = 1, ilo = 0;
5483#else
5484 int ihi = 0, ilo = 1;
5485#endif
5486
Benjamin Peterson29060642009-01-31 22:14:21 +00005487#define STORECHAR(CH) \
5488 do { \
5489 p[ihi] = ((CH) >> 8) & 0xff; \
5490 p[ilo] = (CH) & 0xff; \
5491 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005492 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005494 if (!PyUnicode_Check(str)) {
5495 PyErr_BadArgument();
5496 return NULL;
5497 }
5498 if (PyUnicode_READY(str) < 0)
5499 return NULL;
5500 kind = PyUnicode_KIND(str);
5501 data = PyUnicode_DATA(str);
5502 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005503
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005504 pairs = 0;
5505 if (kind == PyUnicode_4BYTE_KIND)
5506 for (i = 0; i < len; i++)
5507 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5508 pairs++;
5509 /* 2 * (len + pairs + (byteorder == 0)) */
5510 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005512 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005513 bytesize = nsize * 2;
5514 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005516 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 if (v == NULL)
5518 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005520 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005522 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005523 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005524 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005525
5526 if (byteorder == -1) {
5527 /* force LE */
5528 ihi = 1;
5529 ilo = 0;
5530 }
5531 else if (byteorder == 1) {
5532 /* force BE */
5533 ihi = 0;
5534 ilo = 1;
5535 }
5536
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005537 for (i = 0; i < len; i++) {
5538 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5539 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005540 if (ch >= 0x10000) {
5541 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5542 ch = 0xD800 | ((ch-0x10000) >> 10);
5543 }
Tim Peters772747b2001-08-09 22:21:55 +00005544 STORECHAR(ch);
5545 if (ch2)
5546 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005547 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005548
5549 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005550 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005551#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552}
5553
Alexander Belopolsky40018472011-02-26 01:02:56 +00005554PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005555PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5556 Py_ssize_t size,
5557 const char *errors,
5558 int byteorder)
5559{
5560 PyObject *result;
5561 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5562 if (tmp == NULL)
5563 return NULL;
5564 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5565 Py_DECREF(tmp);
5566 return result;
5567}
5568
5569PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005570PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005572 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573}
5574
5575/* --- Unicode Escape Codec ----------------------------------------------- */
5576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005577/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5578 if all the escapes in the string make it still a valid ASCII string.
5579 Returns -1 if any escapes were found which cause the string to
5580 pop out of ASCII range. Otherwise returns the length of the
5581 required buffer to hold the string.
5582 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005583static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005584length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5585{
5586 const unsigned char *p = (const unsigned char *)s;
5587 const unsigned char *end = p + size;
5588 Py_ssize_t length = 0;
5589
5590 if (size < 0)
5591 return -1;
5592
5593 for (; p < end; ++p) {
5594 if (*p > 127) {
5595 /* Non-ASCII */
5596 return -1;
5597 }
5598 else if (*p != '\\') {
5599 /* Normal character */
5600 ++length;
5601 }
5602 else {
5603 /* Backslash-escape, check next char */
5604 ++p;
5605 /* Escape sequence reaches till end of string or
5606 non-ASCII follow-up. */
5607 if (p >= end || *p > 127)
5608 return -1;
5609 switch (*p) {
5610 case '\n':
5611 /* backslash + \n result in zero characters */
5612 break;
5613 case '\\': case '\'': case '\"':
5614 case 'b': case 'f': case 't':
5615 case 'n': case 'r': case 'v': case 'a':
5616 ++length;
5617 break;
5618 case '0': case '1': case '2': case '3':
5619 case '4': case '5': case '6': case '7':
5620 case 'x': case 'u': case 'U': case 'N':
5621 /* these do not guarantee ASCII characters */
5622 return -1;
5623 default:
5624 /* count the backslash + the other character */
5625 length += 2;
5626 }
5627 }
5628 }
5629 return length;
5630}
5631
Fredrik Lundh06d12682001-01-24 07:59:11 +00005632static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005633
Alexander Belopolsky40018472011-02-26 01:02:56 +00005634PyObject *
5635PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005636 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005637 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005639 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005640 Py_ssize_t startinpos;
5641 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005642 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005643 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005645 char* message;
5646 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005647 PyObject *errorHandler = NULL;
5648 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005649 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005650 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005651
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005652 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005653
5654 /* After length_of_escaped_ascii_string() there are two alternatives,
5655 either the string is pure ASCII with named escapes like \n, etc.
5656 and we determined it's exact size (common case)
5657 or it contains \x, \u, ... escape sequences. then we create a
5658 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005659 if (len >= 0) {
5660 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005661 if (!v)
5662 goto onError;
5663 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005664 }
5665 else {
5666 /* Escaped strings will always be longer than the resulting
5667 Unicode string, so we start with size here and then reduce the
5668 length after conversion to the true value.
5669 (but if the error callback returns a long replacement string
5670 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005671 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005672 if (!v)
5673 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005674 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005675 }
5676
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005678 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005679 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005681
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 while (s < end) {
5683 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005684 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005685 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005687 /* The only case in which i == ascii_length is a backslash
5688 followed by a newline. */
5689 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005690
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 /* Non-escape characters are interpreted as Unicode ordinals */
5692 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005693 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5694 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 continue;
5696 }
5697
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005698 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 /* \ - Escapes */
5700 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005701 c = *s++;
5702 if (s > end)
5703 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005704
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005705 /* The only case in which i == ascii_length is a backslash
5706 followed by a newline. */
5707 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005708
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005709 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005712#define WRITECHAR(ch) \
5713 do { \
5714 if (unicode_putchar(&v, &i, ch) < 0) \
5715 goto onError; \
5716 }while(0)
5717
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005719 case '\\': WRITECHAR('\\'); break;
5720 case '\'': WRITECHAR('\''); break;
5721 case '\"': WRITECHAR('\"'); break;
5722 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005723 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005724 case 'f': WRITECHAR('\014'); break;
5725 case 't': WRITECHAR('\t'); break;
5726 case 'n': WRITECHAR('\n'); break;
5727 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005728 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005729 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005730 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005731 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 case '0': case '1': case '2': case '3':
5735 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005736 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005737 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005738 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005739 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005740 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005742 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 break;
5744
Benjamin Peterson29060642009-01-31 22:14:21 +00005745 /* hex escapes */
5746 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005748 digits = 2;
5749 message = "truncated \\xXX escape";
5750 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005754 digits = 4;
5755 message = "truncated \\uXXXX escape";
5756 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757
Benjamin Peterson29060642009-01-31 22:14:21 +00005758 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005759 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005760 digits = 8;
5761 message = "truncated \\UXXXXXXXX escape";
5762 hexescape:
5763 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005764 if (s+digits>end) {
5765 endinpos = size;
5766 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 errors, &errorHandler,
5768 "unicodeescape", "end of string in escape sequence",
5769 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005770 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 goto onError;
5772 goto nextByte;
5773 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005774 for (j = 0; j < digits; ++j) {
5775 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005776 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005777 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005778 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 errors, &errorHandler,
5780 "unicodeescape", message,
5781 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005782 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005783 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005784 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005785 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005786 }
5787 chr = (chr<<4) & ~0xF;
5788 if (c >= '0' && c <= '9')
5789 chr += c - '0';
5790 else if (c >= 'a' && c <= 'f')
5791 chr += 10 + c - 'a';
5792 else
5793 chr += 10 + c - 'A';
5794 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005795 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005796 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005797 /* _decoding_error will have already written into the
5798 target buffer. */
5799 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005800 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005801 /* when we get here, chr is a 32-bit unicode character */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005802 if (chr <= 0x10ffff) {
5803 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005804 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005805 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005806 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005807 errors, &errorHandler,
5808 "unicodeescape", "illegal Unicode character",
5809 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005810 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005811 goto onError;
5812 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005813 break;
5814
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005816 case 'N':
5817 message = "malformed \\N character escape";
5818 if (ucnhash_CAPI == NULL) {
5819 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005820 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5821 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005822 if (ucnhash_CAPI == NULL)
5823 goto ucnhashError;
5824 }
5825 if (*s == '{') {
5826 const char *start = s+1;
5827 /* look for the closing brace */
5828 while (*s != '}' && s < end)
5829 s++;
5830 if (s > start && s < end && *s == '}') {
5831 /* found a name. look it up in the unicode database */
5832 message = "unknown Unicode character name";
5833 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005834 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005835 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005836 goto store;
5837 }
5838 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005839 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005840 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 errors, &errorHandler,
5842 "unicodeescape", message,
5843 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005844 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005845 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005846 break;
5847
5848 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005849 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005850 message = "\\ at end of string";
5851 s--;
5852 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005853 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005854 errors, &errorHandler,
5855 "unicodeescape", message,
5856 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005857 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005858 goto onError;
5859 }
5860 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005861 WRITECHAR('\\');
5862 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005863 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005864 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005866 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005869#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005870
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005871 if (PyUnicode_Resize(&v, i) < 0)
5872 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005873 Py_XDECREF(errorHandler);
5874 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005875 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005876
Benjamin Peterson29060642009-01-31 22:14:21 +00005877 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005878 PyErr_SetString(
5879 PyExc_UnicodeError,
5880 "\\N escapes not supported (can't load unicodedata module)"
5881 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005882 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005883 Py_XDECREF(errorHandler);
5884 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005885 return NULL;
5886
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005889 Py_XDECREF(errorHandler);
5890 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 return NULL;
5892}
5893
5894/* Return a Unicode-Escape string version of the Unicode object.
5895
5896 If quotes is true, the string is enclosed in u"" or u'' quotes as
5897 appropriate.
5898
5899*/
5900
Alexander Belopolsky40018472011-02-26 01:02:56 +00005901PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005902PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005904 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005905 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005907 int kind;
5908 void *data;
5909 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910
Thomas Wouters89f507f2006-12-13 04:49:30 +00005911 /* Initial allocation is based on the longest-possible unichr
5912 escape.
5913
5914 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5915 unichr, so in this case it's the longest unichr escape. In
5916 narrow (UTF-16) builds this is five chars per source unichr
5917 since there are two unichrs in the surrogate pair, so in narrow
5918 (UTF-16) builds it's not the longest unichr escape.
5919
5920 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5921 so in the narrow (UTF-16) build case it's the longest unichr
5922 escape.
5923 */
5924
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005925 if (!PyUnicode_Check(unicode)) {
5926 PyErr_BadArgument();
5927 return NULL;
5928 }
5929 if (PyUnicode_READY(unicode) < 0)
5930 return NULL;
5931 len = PyUnicode_GET_LENGTH(unicode);
5932 kind = PyUnicode_KIND(unicode);
5933 data = PyUnicode_DATA(unicode);
5934 switch(kind) {
5935 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5936 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5937 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5938 }
5939
5940 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005941 return PyBytes_FromStringAndSize(NULL, 0);
5942
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005943 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005945
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005946 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005947 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005948 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 if (repr == NULL)
5951 return NULL;
5952
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005953 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005955 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005956 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005957
Walter Dörwald79e913e2007-05-12 11:08:06 +00005958 /* Escape backslashes */
5959 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 *p++ = '\\';
5961 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005962 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005963 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005964
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005965 /* Map 21-bit characters to '\U00xxxxxx' */
5966 else if (ch >= 0x10000) {
Victor Stinner0d3721d2011-11-22 03:27:53 +01005967 assert(ch <= 0x10FFFF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005968 *p++ = '\\';
5969 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005970 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5971 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5972 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5973 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5974 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5975 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5976 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5977 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005979 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005980
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005982 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 *p++ = '\\';
5984 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005985 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5986 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5987 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5988 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005990
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005991 /* Map special whitespace to '\t', \n', '\r' */
5992 else if (ch == '\t') {
5993 *p++ = '\\';
5994 *p++ = 't';
5995 }
5996 else if (ch == '\n') {
5997 *p++ = '\\';
5998 *p++ = 'n';
5999 }
6000 else if (ch == '\r') {
6001 *p++ = '\\';
6002 *p++ = 'r';
6003 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006004
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006005 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006006 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006008 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006009 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6010 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006011 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006012
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 /* Copy everything else as-is */
6014 else
6015 *p++ = (char) ch;
6016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006018 assert(p - PyBytes_AS_STRING(repr) > 0);
6019 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6020 return NULL;
6021 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022}
6023
Alexander Belopolsky40018472011-02-26 01:02:56 +00006024PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006025PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6026 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006027{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006028 PyObject *result;
6029 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6030 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006032 result = PyUnicode_AsUnicodeEscapeString(tmp);
6033 Py_DECREF(tmp);
6034 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035}
6036
6037/* --- Raw Unicode Escape Codec ------------------------------------------- */
6038
Alexander Belopolsky40018472011-02-26 01:02:56 +00006039PyObject *
6040PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006041 Py_ssize_t size,
6042 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006044 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006045 Py_ssize_t startinpos;
6046 Py_ssize_t endinpos;
6047 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006048 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049 const char *end;
6050 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006051 PyObject *errorHandler = NULL;
6052 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006053
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 /* Escaped strings will always be longer than the resulting
6055 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056 length after conversion to the true value. (But decoding error
6057 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006058 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006062 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006063 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 end = s + size;
6065 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 unsigned char c;
6067 Py_UCS4 x;
6068 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006069 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070
Benjamin Peterson29060642009-01-31 22:14:21 +00006071 /* Non-escape characters are interpreted as Unicode ordinals */
6072 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006073 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6074 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006075 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006076 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 startinpos = s-starts;
6078
6079 /* \u-escapes are only interpreted iff the number of leading
6080 backslashes if odd */
6081 bs = s;
6082 for (;s < end;) {
6083 if (*s != '\\')
6084 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006085 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6086 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 }
6088 if (((s - bs) & 1) == 0 ||
6089 s >= end ||
6090 (*s != 'u' && *s != 'U')) {
6091 continue;
6092 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006093 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 count = *s=='u' ? 4 : 8;
6095 s++;
6096
6097 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006098 for (x = 0, i = 0; i < count; ++i, ++s) {
6099 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006100 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 endinpos = s-starts;
6102 if (unicode_decode_call_errorhandler(
6103 errors, &errorHandler,
6104 "rawunicodeescape", "truncated \\uXXXX",
6105 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006106 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 goto onError;
6108 goto nextByte;
6109 }
6110 x = (x<<4) & ~0xF;
6111 if (c >= '0' && c <= '9')
6112 x += c - '0';
6113 else if (c >= 'a' && c <= 'f')
6114 x += 10 + c - 'a';
6115 else
6116 x += 10 + c - 'A';
6117 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006118 if (x <= 0x10ffff) {
6119 if (unicode_putchar(&v, &outpos, x) < 0)
6120 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006121 } else {
6122 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006123 if (unicode_decode_call_errorhandler(
6124 errors, &errorHandler,
6125 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006127 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006129 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 nextByte:
6131 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006133 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006135 Py_XDECREF(errorHandler);
6136 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006137 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006138
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006140 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006141 Py_XDECREF(errorHandler);
6142 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 return NULL;
6144}
6145
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006146
Alexander Belopolsky40018472011-02-26 01:02:56 +00006147PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006148PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006150 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 char *p;
6152 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006153 Py_ssize_t expandsize, pos;
6154 int kind;
6155 void *data;
6156 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006158 if (!PyUnicode_Check(unicode)) {
6159 PyErr_BadArgument();
6160 return NULL;
6161 }
6162 if (PyUnicode_READY(unicode) < 0)
6163 return NULL;
6164 kind = PyUnicode_KIND(unicode);
6165 data = PyUnicode_DATA(unicode);
6166 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006167
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006168 switch(kind) {
6169 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6170 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6171 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6172 }
Victor Stinner0e368262011-11-10 20:12:49 +01006173
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006174 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006175 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006176
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006177 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 if (repr == NULL)
6179 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006181 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006183 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006184 for (pos = 0; pos < len; pos++) {
6185 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 /* Map 32-bit characters to '\Uxxxxxxxx' */
6187 if (ch >= 0x10000) {
Victor Stinner0d3721d2011-11-22 03:27:53 +01006188 assert(ch <= 0x10FFFF);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006189 *p++ = '\\';
6190 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006191 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6192 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6193 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6194 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6195 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6196 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6197 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6198 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006199 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006201 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202 *p++ = '\\';
6203 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006204 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6205 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6206 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6207 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006209 /* Copy everything else as-is */
6210 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 *p++ = (char) ch;
6212 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006213
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006214 assert(p > q);
6215 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006216 return NULL;
6217 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218}
6219
Alexander Belopolsky40018472011-02-26 01:02:56 +00006220PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006221PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6222 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006224 PyObject *result;
6225 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6226 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006227 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006228 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6229 Py_DECREF(tmp);
6230 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231}
6232
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006233/* --- Unicode Internal Codec ------------------------------------------- */
6234
Alexander Belopolsky40018472011-02-26 01:02:56 +00006235PyObject *
6236_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006237 Py_ssize_t size,
6238 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006239{
6240 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006241 Py_ssize_t startinpos;
6242 Py_ssize_t endinpos;
6243 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006244 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006245 const char *end;
6246 const char *reason;
6247 PyObject *errorHandler = NULL;
6248 PyObject *exc = NULL;
6249
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006250 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006251 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006252 1))
6253 return NULL;
6254
Thomas Wouters89f507f2006-12-13 04:49:30 +00006255 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006256 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006257 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006258 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006259 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006260 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006261 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006262 end = s + size;
6263
6264 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006265 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006266 Py_UCS4 ch;
6267 /* We copy the raw representation one byte at a time because the
6268 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006269 ((char *) &uch)[0] = s[0];
6270 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006271#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006272 ((char *) &uch)[2] = s[2];
6273 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006274#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006275 ch = uch;
6276
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006277 /* We have to sanity check the raw data, otherwise doom looms for
6278 some malformed UCS-4 data. */
6279 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006280#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006281 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006282#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006283 end-s < Py_UNICODE_SIZE
6284 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006286 startinpos = s - starts;
6287 if (end-s < Py_UNICODE_SIZE) {
6288 endinpos = end-starts;
6289 reason = "truncated input";
6290 }
6291 else {
6292 endinpos = s - starts + Py_UNICODE_SIZE;
6293 reason = "illegal code point (> 0x10FFFF)";
6294 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006295 if (unicode_decode_call_errorhandler(
6296 errors, &errorHandler,
6297 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006298 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006299 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006300 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006301 continue;
6302 }
6303
6304 s += Py_UNICODE_SIZE;
6305#ifndef Py_UNICODE_WIDE
6306 if (ch >= 0xD800 && ch <= 0xDBFF && s < end)
6307 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006308 Py_UNICODE uch2;
6309 ((char *) &uch2)[0] = s[0];
6310 ((char *) &uch2)[1] = s[1];
6311 if (uch2 >= 0xDC00 && uch2 <= 0xDFFF)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006312 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006313 ch = (((uch & 0x3FF)<<10) | (uch2 & 0x3FF)) + 0x10000;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006314 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006315 }
6316 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006317#endif
6318
6319 if (unicode_putchar(&v, &outpos, ch) < 0)
6320 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006321 }
6322
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006323 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006324 goto onError;
6325 Py_XDECREF(errorHandler);
6326 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006327 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006328
Benjamin Peterson29060642009-01-31 22:14:21 +00006329 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006330 Py_XDECREF(v);
6331 Py_XDECREF(errorHandler);
6332 Py_XDECREF(exc);
6333 return NULL;
6334}
6335
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336/* --- Latin-1 Codec ------------------------------------------------------ */
6337
Alexander Belopolsky40018472011-02-26 01:02:56 +00006338PyObject *
6339PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006340 Py_ssize_t size,
6341 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006344 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345}
6346
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006347/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006348static void
6349make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006350 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006351 PyObject *unicode,
6352 Py_ssize_t startpos, Py_ssize_t endpos,
6353 const char *reason)
6354{
6355 if (*exceptionObject == NULL) {
6356 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006357 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006358 encoding, unicode, startpos, endpos, reason);
6359 }
6360 else {
6361 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6362 goto onError;
6363 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6364 goto onError;
6365 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6366 goto onError;
6367 return;
6368 onError:
6369 Py_DECREF(*exceptionObject);
6370 *exceptionObject = NULL;
6371 }
6372}
6373
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006374/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006375static void
6376raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006377 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006378 PyObject *unicode,
6379 Py_ssize_t startpos, Py_ssize_t endpos,
6380 const char *reason)
6381{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006382 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006383 encoding, unicode, startpos, endpos, reason);
6384 if (*exceptionObject != NULL)
6385 PyCodec_StrictErrors(*exceptionObject);
6386}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006387
6388/* error handling callback helper:
6389 build arguments, call the callback and check the arguments,
6390 put the result into newpos and return the replacement string, which
6391 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006392static PyObject *
6393unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006394 PyObject **errorHandler,
6395 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006396 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006397 Py_ssize_t startpos, Py_ssize_t endpos,
6398 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006400 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006401 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006402 PyObject *restuple;
6403 PyObject *resunicode;
6404
6405 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006407 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006408 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006409 }
6410
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006411 if (PyUnicode_READY(unicode) < 0)
6412 return NULL;
6413 len = PyUnicode_GET_LENGTH(unicode);
6414
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006415 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006416 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006417 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006418 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419
6420 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006422 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006424 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006425 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 Py_DECREF(restuple);
6427 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006428 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006429 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 &resunicode, newpos)) {
6431 Py_DECREF(restuple);
6432 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006433 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006434 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6435 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6436 Py_DECREF(restuple);
6437 return NULL;
6438 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006439 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006440 *newpos = len + *newpos;
6441 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006442 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6443 Py_DECREF(restuple);
6444 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006445 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006446 Py_INCREF(resunicode);
6447 Py_DECREF(restuple);
6448 return resunicode;
6449}
6450
Alexander Belopolsky40018472011-02-26 01:02:56 +00006451static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006452unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006453 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006454 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006456 /* input state */
6457 Py_ssize_t pos=0, size;
6458 int kind;
6459 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006460 /* output object */
6461 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006462 /* pointer into the output */
6463 char *str;
6464 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006465 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006466 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6467 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006468 PyObject *errorHandler = NULL;
6469 PyObject *exc = NULL;
6470 /* the following variable is used for caching string comparisons
6471 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6472 int known_errorHandler = -1;
6473
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006474 if (PyUnicode_READY(unicode) < 0)
6475 return NULL;
6476 size = PyUnicode_GET_LENGTH(unicode);
6477 kind = PyUnicode_KIND(unicode);
6478 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006479 /* allocate enough for a simple encoding without
6480 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006481 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006482 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006483 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006484 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006485 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006486 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006487 ressize = size;
6488
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006489 while (pos < size) {
6490 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006491
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 /* can we encode this? */
6493 if (c<limit) {
6494 /* no overflow check, because we know that the space is enough */
6495 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006496 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006497 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 Py_ssize_t requiredsize;
6500 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006501 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006503 Py_ssize_t collstart = pos;
6504 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006506 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 ++collend;
6508 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6509 if (known_errorHandler==-1) {
6510 if ((errors==NULL) || (!strcmp(errors, "strict")))
6511 known_errorHandler = 1;
6512 else if (!strcmp(errors, "replace"))
6513 known_errorHandler = 2;
6514 else if (!strcmp(errors, "ignore"))
6515 known_errorHandler = 3;
6516 else if (!strcmp(errors, "xmlcharrefreplace"))
6517 known_errorHandler = 4;
6518 else
6519 known_errorHandler = 0;
6520 }
6521 switch (known_errorHandler) {
6522 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006523 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006524 goto onError;
6525 case 2: /* replace */
6526 while (collstart++<collend)
6527 *str++ = '?'; /* fall through */
6528 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006529 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006530 break;
6531 case 4: /* xmlcharrefreplace */
6532 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006533 /* determine replacement size */
6534 for (i = collstart, repsize = 0; i < collend; ++i) {
6535 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6536 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006538 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006540 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006541 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006542 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006543 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006544 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006545 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006546 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006548 else {
6549 assert(ch <= 0x10FFFF);
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006551 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006553 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006554 if (requiredsize > ressize) {
6555 if (requiredsize<2*ressize)
6556 requiredsize = 2*ressize;
6557 if (_PyBytes_Resize(&res, requiredsize))
6558 goto onError;
6559 str = PyBytes_AS_STRING(res) + respos;
6560 ressize = requiredsize;
6561 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006562 /* generate replacement */
6563 for (i = collstart; i < collend; ++i) {
6564 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006566 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 break;
6568 default:
6569 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006570 encoding, reason, unicode, &exc,
6571 collstart, collend, &newpos);
6572 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6573 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006575 if (PyBytes_Check(repunicode)) {
6576 /* Directly copy bytes result to output. */
6577 repsize = PyBytes_Size(repunicode);
6578 if (repsize > 1) {
6579 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006580 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006581 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6582 Py_DECREF(repunicode);
6583 goto onError;
6584 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006585 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006586 ressize += repsize-1;
6587 }
6588 memcpy(str, PyBytes_AsString(repunicode), repsize);
6589 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006590 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006591 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006592 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006593 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006594 /* need more space? (at least enough for what we
6595 have+the replacement+the rest of the string, so
6596 we won't have to check space for encodable characters) */
6597 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006598 repsize = PyUnicode_GET_LENGTH(repunicode);
6599 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006600 if (requiredsize > ressize) {
6601 if (requiredsize<2*ressize)
6602 requiredsize = 2*ressize;
6603 if (_PyBytes_Resize(&res, requiredsize)) {
6604 Py_DECREF(repunicode);
6605 goto onError;
6606 }
6607 str = PyBytes_AS_STRING(res) + respos;
6608 ressize = requiredsize;
6609 }
6610 /* check if there is anything unencodable in the replacement
6611 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006612 for (i = 0; repsize-->0; ++i, ++str) {
6613 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006615 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006616 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006617 Py_DECREF(repunicode);
6618 goto onError;
6619 }
6620 *str = (char)c;
6621 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006622 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006623 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006624 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006625 }
6626 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006627 /* Resize if we allocated to much */
6628 size = str - PyBytes_AS_STRING(res);
6629 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006630 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006631 if (_PyBytes_Resize(&res, size) < 0)
6632 goto onError;
6633 }
6634
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006635 Py_XDECREF(errorHandler);
6636 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006637 return res;
6638
6639 onError:
6640 Py_XDECREF(res);
6641 Py_XDECREF(errorHandler);
6642 Py_XDECREF(exc);
6643 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644}
6645
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006646/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006647PyObject *
6648PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006649 Py_ssize_t size,
6650 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006652 PyObject *result;
6653 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6654 if (unicode == NULL)
6655 return NULL;
6656 result = unicode_encode_ucs1(unicode, errors, 256);
6657 Py_DECREF(unicode);
6658 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659}
6660
Alexander Belopolsky40018472011-02-26 01:02:56 +00006661PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006662_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663{
6664 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006665 PyErr_BadArgument();
6666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006668 if (PyUnicode_READY(unicode) == -1)
6669 return NULL;
6670 /* Fast path: if it is a one-byte string, construct
6671 bytes object directly. */
6672 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6673 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6674 PyUnicode_GET_LENGTH(unicode));
6675 /* Non-Latin-1 characters present. Defer to above function to
6676 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006677 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006678}
6679
6680PyObject*
6681PyUnicode_AsLatin1String(PyObject *unicode)
6682{
6683 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684}
6685
6686/* --- 7-bit ASCII Codec -------------------------------------------------- */
6687
Alexander Belopolsky40018472011-02-26 01:02:56 +00006688PyObject *
6689PyUnicode_DecodeASCII(const char *s,
6690 Py_ssize_t size,
6691 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006693 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006694 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006695 int kind;
6696 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006697 Py_ssize_t startinpos;
6698 Py_ssize_t endinpos;
6699 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006700 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006701 int has_error;
6702 const unsigned char *p = (const unsigned char *)s;
6703 const unsigned char *end = p + size;
6704 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006705 PyObject *errorHandler = NULL;
6706 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006707
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006708 if (size == 0) {
6709 Py_INCREF(unicode_empty);
6710 return unicode_empty;
6711 }
6712
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006714 if (size == 1 && (unsigned char)s[0] < 128)
6715 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006716
Victor Stinner702c7342011-10-05 13:50:52 +02006717 has_error = 0;
6718 while (p < end && !has_error) {
6719 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6720 an explanation. */
6721 if (!((size_t) p & LONG_PTR_MASK)) {
6722 /* Help register allocation */
6723 register const unsigned char *_p = p;
6724 while (_p < aligned_end) {
6725 unsigned long value = *(unsigned long *) _p;
6726 if (value & ASCII_CHAR_MASK) {
6727 has_error = 1;
6728 break;
6729 }
6730 _p += SIZEOF_LONG;
6731 }
6732 if (_p == end)
6733 break;
6734 if (has_error)
6735 break;
6736 p = _p;
6737 }
6738 if (*p & 0x80) {
6739 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006740 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006741 }
6742 else {
6743 ++p;
6744 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006745 }
Victor Stinner702c7342011-10-05 13:50:52 +02006746 if (!has_error)
6747 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006748
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006749 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006753 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006754 kind = PyUnicode_KIND(v);
6755 data = PyUnicode_DATA(v);
6756 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006757 e = s + size;
6758 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 register unsigned char c = (unsigned char)*s;
6760 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006761 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006762 ++s;
6763 }
6764 else {
6765 startinpos = s-starts;
6766 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 if (unicode_decode_call_errorhandler(
6768 errors, &errorHandler,
6769 "ascii", "ordinal not in range(128)",
6770 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006771 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006773 kind = PyUnicode_KIND(v);
6774 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006777 if (PyUnicode_Resize(&v, outpos) < 0)
6778 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006779 Py_XDECREF(errorHandler);
6780 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006781 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006782 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006783
Benjamin Peterson29060642009-01-31 22:14:21 +00006784 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006786 Py_XDECREF(errorHandler);
6787 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788 return NULL;
6789}
6790
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006791/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006792PyObject *
6793PyUnicode_EncodeASCII(const Py_UNICODE *p,
6794 Py_ssize_t size,
6795 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006797 PyObject *result;
6798 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6799 if (unicode == NULL)
6800 return NULL;
6801 result = unicode_encode_ucs1(unicode, errors, 128);
6802 Py_DECREF(unicode);
6803 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804}
6805
Alexander Belopolsky40018472011-02-26 01:02:56 +00006806PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006807_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808{
6809 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 PyErr_BadArgument();
6811 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006813 if (PyUnicode_READY(unicode) == -1)
6814 return NULL;
6815 /* Fast path: if it is an ASCII-only string, construct bytes object
6816 directly. Else defer to above function to raise the exception. */
6817 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6818 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6819 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006820 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006821}
6822
6823PyObject *
6824PyUnicode_AsASCIIString(PyObject *unicode)
6825{
6826 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006827}
6828
Victor Stinner99b95382011-07-04 14:23:54 +02006829#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006830
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006831/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006832
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006833#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006834#define NEED_RETRY
6835#endif
6836
Victor Stinner3a50e702011-10-18 21:21:00 +02006837#ifndef WC_ERR_INVALID_CHARS
6838# define WC_ERR_INVALID_CHARS 0x0080
6839#endif
6840
6841static char*
6842code_page_name(UINT code_page, PyObject **obj)
6843{
6844 *obj = NULL;
6845 if (code_page == CP_ACP)
6846 return "mbcs";
6847 if (code_page == CP_UTF7)
6848 return "CP_UTF7";
6849 if (code_page == CP_UTF8)
6850 return "CP_UTF8";
6851
6852 *obj = PyBytes_FromFormat("cp%u", code_page);
6853 if (*obj == NULL)
6854 return NULL;
6855 return PyBytes_AS_STRING(*obj);
6856}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006857
Alexander Belopolsky40018472011-02-26 01:02:56 +00006858static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006859is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006860{
6861 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006862 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006863
Victor Stinner3a50e702011-10-18 21:21:00 +02006864 if (!IsDBCSLeadByteEx(code_page, *curr))
6865 return 0;
6866
6867 prev = CharPrevExA(code_page, s, curr, 0);
6868 if (prev == curr)
6869 return 1;
6870 /* FIXME: This code is limited to "true" double-byte encodings,
6871 as it assumes an incomplete character consists of a single
6872 byte. */
6873 if (curr - prev == 2)
6874 return 1;
6875 if (!IsDBCSLeadByteEx(code_page, *prev))
6876 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006877 return 0;
6878}
6879
Victor Stinner3a50e702011-10-18 21:21:00 +02006880static DWORD
6881decode_code_page_flags(UINT code_page)
6882{
6883 if (code_page == CP_UTF7) {
6884 /* The CP_UTF7 decoder only supports flags=0 */
6885 return 0;
6886 }
6887 else
6888 return MB_ERR_INVALID_CHARS;
6889}
6890
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006891/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006892 * Decode a byte string from a Windows code page into unicode object in strict
6893 * mode.
6894 *
6895 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6896 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006897 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006898static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006899decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006900 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006901 const char *in,
6902 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006903{
Victor Stinner3a50e702011-10-18 21:21:00 +02006904 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006905 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006906 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006907
6908 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006909 assert(insize > 0);
6910 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6911 if (outsize <= 0)
6912 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006913
6914 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006915 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006916 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 if (*v == NULL)
6918 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006919 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006920 }
6921 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006922 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006923 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006924 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006925 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006926 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006927 }
6928
6929 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006930 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6931 if (outsize <= 0)
6932 goto error;
6933 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006934
Victor Stinner3a50e702011-10-18 21:21:00 +02006935error:
6936 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6937 return -2;
6938 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006939 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006940}
6941
Victor Stinner3a50e702011-10-18 21:21:00 +02006942/*
6943 * Decode a byte string from a code page into unicode object with an error
6944 * handler.
6945 *
6946 * Returns consumed size if succeed, or raise a WindowsError or
6947 * UnicodeDecodeError exception and returns -1 on error.
6948 */
6949static int
6950decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006951 PyObject **v,
6952 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006953 const char *errors)
6954{
6955 const char *startin = in;
6956 const char *endin = in + size;
6957 const DWORD flags = decode_code_page_flags(code_page);
6958 /* Ideally, we should get reason from FormatMessage. This is the Windows
6959 2000 English version of the message. */
6960 const char *reason = "No mapping for the Unicode character exists "
6961 "in the target code page.";
6962 /* each step cannot decode more than 1 character, but a character can be
6963 represented as a surrogate pair */
6964 wchar_t buffer[2], *startout, *out;
6965 int insize, outsize;
6966 PyObject *errorHandler = NULL;
6967 PyObject *exc = NULL;
6968 PyObject *encoding_obj = NULL;
6969 char *encoding;
6970 DWORD err;
6971 int ret = -1;
6972
6973 assert(size > 0);
6974
6975 encoding = code_page_name(code_page, &encoding_obj);
6976 if (encoding == NULL)
6977 return -1;
6978
6979 if (errors == NULL || strcmp(errors, "strict") == 0) {
6980 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6981 UnicodeDecodeError. */
6982 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6983 if (exc != NULL) {
6984 PyCodec_StrictErrors(exc);
6985 Py_CLEAR(exc);
6986 }
6987 goto error;
6988 }
6989
6990 if (*v == NULL) {
6991 /* Create unicode object */
6992 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6993 PyErr_NoMemory();
6994 goto error;
6995 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006996 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006997 if (*v == NULL)
6998 goto error;
6999 startout = PyUnicode_AS_UNICODE(*v);
7000 }
7001 else {
7002 /* Extend unicode object */
7003 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7004 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7005 PyErr_NoMemory();
7006 goto error;
7007 }
Victor Stinner76a31a62011-11-04 00:05:13 +01007008 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007009 goto error;
7010 startout = PyUnicode_AS_UNICODE(*v) + n;
7011 }
7012
7013 /* Decode the byte string character per character */
7014 out = startout;
7015 while (in < endin)
7016 {
7017 /* Decode a character */
7018 insize = 1;
7019 do
7020 {
7021 outsize = MultiByteToWideChar(code_page, flags,
7022 in, insize,
7023 buffer, Py_ARRAY_LENGTH(buffer));
7024 if (outsize > 0)
7025 break;
7026 err = GetLastError();
7027 if (err != ERROR_NO_UNICODE_TRANSLATION
7028 && err != ERROR_INSUFFICIENT_BUFFER)
7029 {
7030 PyErr_SetFromWindowsErr(0);
7031 goto error;
7032 }
7033 insize++;
7034 }
7035 /* 4=maximum length of a UTF-8 sequence */
7036 while (insize <= 4 && (in + insize) <= endin);
7037
7038 if (outsize <= 0) {
7039 Py_ssize_t startinpos, endinpos, outpos;
7040
7041 startinpos = in - startin;
7042 endinpos = startinpos + 1;
7043 outpos = out - PyUnicode_AS_UNICODE(*v);
7044 if (unicode_decode_call_errorhandler(
7045 errors, &errorHandler,
7046 encoding, reason,
7047 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007048 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007049 {
7050 goto error;
7051 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007052 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007053 }
7054 else {
7055 in += insize;
7056 memcpy(out, buffer, outsize * sizeof(wchar_t));
7057 out += outsize;
7058 }
7059 }
7060
7061 /* write a NUL character at the end */
7062 *out = 0;
7063
7064 /* Extend unicode object */
7065 outsize = out - startout;
7066 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007067 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007068 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007069 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007070
7071error:
7072 Py_XDECREF(encoding_obj);
7073 Py_XDECREF(errorHandler);
7074 Py_XDECREF(exc);
7075 return ret;
7076}
7077
Victor Stinner3a50e702011-10-18 21:21:00 +02007078static PyObject *
7079decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007080 const char *s, Py_ssize_t size,
7081 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007082{
Victor Stinner76a31a62011-11-04 00:05:13 +01007083 PyObject *v = NULL;
7084 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007085
Victor Stinner3a50e702011-10-18 21:21:00 +02007086 if (code_page < 0) {
7087 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7088 return NULL;
7089 }
7090
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007091 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007092 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007093
Victor Stinner76a31a62011-11-04 00:05:13 +01007094 do
7095 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007096#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007097 if (size > INT_MAX) {
7098 chunk_size = INT_MAX;
7099 final = 0;
7100 done = 0;
7101 }
7102 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007103#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007104 {
7105 chunk_size = (int)size;
7106 final = (consumed == NULL);
7107 done = 1;
7108 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007109
Victor Stinner76a31a62011-11-04 00:05:13 +01007110 /* Skip trailing lead-byte unless 'final' is set */
7111 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7112 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007113
Victor Stinner76a31a62011-11-04 00:05:13 +01007114 if (chunk_size == 0 && done) {
7115 if (v != NULL)
7116 break;
7117 Py_INCREF(unicode_empty);
7118 return unicode_empty;
7119 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007120
Victor Stinner76a31a62011-11-04 00:05:13 +01007121
7122 converted = decode_code_page_strict(code_page, &v,
7123 s, chunk_size);
7124 if (converted == -2)
7125 converted = decode_code_page_errors(code_page, &v,
7126 s, chunk_size,
7127 errors);
7128 assert(converted != 0);
7129
7130 if (converted < 0) {
7131 Py_XDECREF(v);
7132 return NULL;
7133 }
7134
7135 if (consumed)
7136 *consumed += converted;
7137
7138 s += converted;
7139 size -= converted;
7140 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007141
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007142 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007143}
7144
Alexander Belopolsky40018472011-02-26 01:02:56 +00007145PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007146PyUnicode_DecodeCodePageStateful(int code_page,
7147 const char *s,
7148 Py_ssize_t size,
7149 const char *errors,
7150 Py_ssize_t *consumed)
7151{
7152 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7153}
7154
7155PyObject *
7156PyUnicode_DecodeMBCSStateful(const char *s,
7157 Py_ssize_t size,
7158 const char *errors,
7159 Py_ssize_t *consumed)
7160{
7161 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7162}
7163
7164PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007165PyUnicode_DecodeMBCS(const char *s,
7166 Py_ssize_t size,
7167 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007168{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007169 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7170}
7171
Victor Stinner3a50e702011-10-18 21:21:00 +02007172static DWORD
7173encode_code_page_flags(UINT code_page, const char *errors)
7174{
7175 if (code_page == CP_UTF8) {
7176 if (winver.dwMajorVersion >= 6)
7177 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7178 and later */
7179 return WC_ERR_INVALID_CHARS;
7180 else
7181 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7182 return 0;
7183 }
7184 else if (code_page == CP_UTF7) {
7185 /* CP_UTF7 only supports flags=0 */
7186 return 0;
7187 }
7188 else {
7189 if (errors != NULL && strcmp(errors, "replace") == 0)
7190 return 0;
7191 else
7192 return WC_NO_BEST_FIT_CHARS;
7193 }
7194}
7195
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007196/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 * Encode a Unicode string to a Windows code page into a byte string in strict
7198 * mode.
7199 *
7200 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7201 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007202 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007203static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007204encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007205 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007206 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007207{
Victor Stinner554f3f02010-06-16 23:33:54 +00007208 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007209 BOOL *pusedDefaultChar = &usedDefaultChar;
7210 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007211 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007212 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007213 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007214 const DWORD flags = encode_code_page_flags(code_page, NULL);
7215 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007216 /* Create a substring so that we can get the UTF-16 representation
7217 of just the slice under consideration. */
7218 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007219
Martin v. Löwis3d325192011-11-04 18:23:06 +01007220 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007221
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007223 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007224 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007225 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007226
Victor Stinner2fc507f2011-11-04 20:06:39 +01007227 substring = PyUnicode_Substring(unicode, offset, offset+len);
7228 if (substring == NULL)
7229 return -1;
7230 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7231 if (p == NULL) {
7232 Py_DECREF(substring);
7233 return -1;
7234 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007235
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007236 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007237 outsize = WideCharToMultiByte(code_page, flags,
7238 p, size,
7239 NULL, 0,
7240 NULL, pusedDefaultChar);
7241 if (outsize <= 0)
7242 goto error;
7243 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007244 if (pusedDefaultChar && *pusedDefaultChar) {
7245 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007246 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007247 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007248
Victor Stinner3a50e702011-10-18 21:21:00 +02007249 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007250 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007251 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007252 if (*outbytes == NULL) {
7253 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007255 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007256 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007257 }
7258 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007260 const Py_ssize_t n = PyBytes_Size(*outbytes);
7261 if (outsize > PY_SSIZE_T_MAX - n) {
7262 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007263 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007264 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007265 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007266 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7267 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007268 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007269 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007270 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007271 }
7272
7273 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007274 outsize = WideCharToMultiByte(code_page, flags,
7275 p, size,
7276 out, outsize,
7277 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007278 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007279 if (outsize <= 0)
7280 goto error;
7281 if (pusedDefaultChar && *pusedDefaultChar)
7282 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007283 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007284
Victor Stinner3a50e702011-10-18 21:21:00 +02007285error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007286 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007287 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7288 return -2;
7289 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007290 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007291}
7292
Victor Stinner3a50e702011-10-18 21:21:00 +02007293/*
7294 * Encode a Unicode string to a Windows code page into a byte string using a
7295 * error handler.
7296 *
7297 * Returns consumed characters if succeed, or raise a WindowsError and returns
7298 * -1 on other error.
7299 */
7300static int
7301encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007302 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007303 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007304{
Victor Stinner3a50e702011-10-18 21:21:00 +02007305 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007306 Py_ssize_t pos = unicode_offset;
7307 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007308 /* Ideally, we should get reason from FormatMessage. This is the Windows
7309 2000 English version of the message. */
7310 const char *reason = "invalid character";
7311 /* 4=maximum length of a UTF-8 sequence */
7312 char buffer[4];
7313 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7314 Py_ssize_t outsize;
7315 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007316 PyObject *errorHandler = NULL;
7317 PyObject *exc = NULL;
7318 PyObject *encoding_obj = NULL;
7319 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007320 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007321 PyObject *rep;
7322 int ret = -1;
7323
7324 assert(insize > 0);
7325
7326 encoding = code_page_name(code_page, &encoding_obj);
7327 if (encoding == NULL)
7328 return -1;
7329
7330 if (errors == NULL || strcmp(errors, "strict") == 0) {
7331 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7332 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007333 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007334 if (exc != NULL) {
7335 PyCodec_StrictErrors(exc);
7336 Py_DECREF(exc);
7337 }
7338 Py_XDECREF(encoding_obj);
7339 return -1;
7340 }
7341
7342 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7343 pusedDefaultChar = &usedDefaultChar;
7344 else
7345 pusedDefaultChar = NULL;
7346
7347 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7348 PyErr_NoMemory();
7349 goto error;
7350 }
7351 outsize = insize * Py_ARRAY_LENGTH(buffer);
7352
7353 if (*outbytes == NULL) {
7354 /* Create string object */
7355 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7356 if (*outbytes == NULL)
7357 goto error;
7358 out = PyBytes_AS_STRING(*outbytes);
7359 }
7360 else {
7361 /* Extend string object */
7362 Py_ssize_t n = PyBytes_Size(*outbytes);
7363 if (n > PY_SSIZE_T_MAX - outsize) {
7364 PyErr_NoMemory();
7365 goto error;
7366 }
7367 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7368 goto error;
7369 out = PyBytes_AS_STRING(*outbytes) + n;
7370 }
7371
7372 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007373 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007374 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007375 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7376 wchar_t chars[2];
7377 int charsize;
7378 if (ch < 0x10000) {
7379 chars[0] = (wchar_t)ch;
7380 charsize = 1;
7381 }
7382 else {
7383 ch -= 0x10000;
7384 chars[0] = 0xd800 + (ch >> 10);
7385 chars[1] = 0xdc00 + (ch & 0x3ff);
7386 charsize = 2;
7387 }
7388
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007390 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007391 buffer, Py_ARRAY_LENGTH(buffer),
7392 NULL, pusedDefaultChar);
7393 if (outsize > 0) {
7394 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7395 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007396 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007397 memcpy(out, buffer, outsize);
7398 out += outsize;
7399 continue;
7400 }
7401 }
7402 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7403 PyErr_SetFromWindowsErr(0);
7404 goto error;
7405 }
7406
Victor Stinner3a50e702011-10-18 21:21:00 +02007407 rep = unicode_encode_call_errorhandler(
7408 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007409 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007410 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007411 if (rep == NULL)
7412 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007413 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007414
7415 if (PyBytes_Check(rep)) {
7416 outsize = PyBytes_GET_SIZE(rep);
7417 if (outsize != 1) {
7418 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7419 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7420 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7421 Py_DECREF(rep);
7422 goto error;
7423 }
7424 out = PyBytes_AS_STRING(*outbytes) + offset;
7425 }
7426 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7427 out += outsize;
7428 }
7429 else {
7430 Py_ssize_t i;
7431 enum PyUnicode_Kind kind;
7432 void *data;
7433
7434 if (PyUnicode_READY(rep) < 0) {
7435 Py_DECREF(rep);
7436 goto error;
7437 }
7438
7439 outsize = PyUnicode_GET_LENGTH(rep);
7440 if (outsize != 1) {
7441 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7442 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7443 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7444 Py_DECREF(rep);
7445 goto error;
7446 }
7447 out = PyBytes_AS_STRING(*outbytes) + offset;
7448 }
7449 kind = PyUnicode_KIND(rep);
7450 data = PyUnicode_DATA(rep);
7451 for (i=0; i < outsize; i++) {
7452 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7453 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007454 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007455 encoding, unicode,
7456 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007457 "unable to encode error handler result to ASCII");
7458 Py_DECREF(rep);
7459 goto error;
7460 }
7461 *out = (unsigned char)ch;
7462 out++;
7463 }
7464 }
7465 Py_DECREF(rep);
7466 }
7467 /* write a NUL byte */
7468 *out = 0;
7469 outsize = out - PyBytes_AS_STRING(*outbytes);
7470 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7471 if (_PyBytes_Resize(outbytes, outsize) < 0)
7472 goto error;
7473 ret = 0;
7474
7475error:
7476 Py_XDECREF(encoding_obj);
7477 Py_XDECREF(errorHandler);
7478 Py_XDECREF(exc);
7479 return ret;
7480}
7481
Victor Stinner3a50e702011-10-18 21:21:00 +02007482static PyObject *
7483encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007484 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 const char *errors)
7486{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007487 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007489 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007490 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007491
Victor Stinner2fc507f2011-11-04 20:06:39 +01007492 if (PyUnicode_READY(unicode) < 0)
7493 return NULL;
7494 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007495
Victor Stinner3a50e702011-10-18 21:21:00 +02007496 if (code_page < 0) {
7497 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7498 return NULL;
7499 }
7500
Martin v. Löwis3d325192011-11-04 18:23:06 +01007501 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007502 return PyBytes_FromStringAndSize(NULL, 0);
7503
Victor Stinner7581cef2011-11-03 22:32:33 +01007504 offset = 0;
7505 do
7506 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007507#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007508 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007509 chunks. */
7510 if (len > INT_MAX/2) {
7511 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007512 done = 0;
7513 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007514 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007515#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007516 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007517 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007518 done = 1;
7519 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007520
Victor Stinner76a31a62011-11-04 00:05:13 +01007521 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007522 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007523 errors);
7524 if (ret == -2)
7525 ret = encode_code_page_errors(code_page, &outbytes,
7526 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007527 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007528 if (ret < 0) {
7529 Py_XDECREF(outbytes);
7530 return NULL;
7531 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007532
Victor Stinner7581cef2011-11-03 22:32:33 +01007533 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007534 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007535 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007536
Victor Stinner3a50e702011-10-18 21:21:00 +02007537 return outbytes;
7538}
7539
7540PyObject *
7541PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7542 Py_ssize_t size,
7543 const char *errors)
7544{
Victor Stinner7581cef2011-11-03 22:32:33 +01007545 PyObject *unicode, *res;
7546 unicode = PyUnicode_FromUnicode(p, size);
7547 if (unicode == NULL)
7548 return NULL;
7549 res = encode_code_page(CP_ACP, unicode, errors);
7550 Py_DECREF(unicode);
7551 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007552}
7553
7554PyObject *
7555PyUnicode_EncodeCodePage(int code_page,
7556 PyObject *unicode,
7557 const char *errors)
7558{
Victor Stinner7581cef2011-11-03 22:32:33 +01007559 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007560}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007561
Alexander Belopolsky40018472011-02-26 01:02:56 +00007562PyObject *
7563PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007564{
7565 if (!PyUnicode_Check(unicode)) {
7566 PyErr_BadArgument();
7567 return NULL;
7568 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007569 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007570}
7571
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007572#undef NEED_RETRY
7573
Victor Stinner99b95382011-07-04 14:23:54 +02007574#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007575
Guido van Rossumd57fd912000-03-10 22:53:23 +00007576/* --- Character Mapping Codec -------------------------------------------- */
7577
Alexander Belopolsky40018472011-02-26 01:02:56 +00007578PyObject *
7579PyUnicode_DecodeCharmap(const char *s,
7580 Py_ssize_t size,
7581 PyObject *mapping,
7582 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007584 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007585 Py_ssize_t startinpos;
7586 Py_ssize_t endinpos;
7587 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007588 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007589 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007590 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007591 PyObject *errorHandler = NULL;
7592 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007593
Guido van Rossumd57fd912000-03-10 22:53:23 +00007594 /* Default to Latin-1 */
7595 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007597
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007598 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007599 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007602 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007603 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007604 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007605 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007606 Py_ssize_t maplen;
7607 enum PyUnicode_Kind kind;
7608 void *data;
7609 Py_UCS4 x;
7610
7611 if (PyUnicode_READY(mapping) < 0)
7612 return NULL;
7613
7614 maplen = PyUnicode_GET_LENGTH(mapping);
7615 data = PyUnicode_DATA(mapping);
7616 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007617 while (s < e) {
7618 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007619
Benjamin Peterson29060642009-01-31 22:14:21 +00007620 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007621 x = PyUnicode_READ(kind, data, ch);
7622 else
7623 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007624
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007625 if (x == 0xfffe)
7626 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007627 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007628 startinpos = s-starts;
7629 endinpos = startinpos+1;
7630 if (unicode_decode_call_errorhandler(
7631 errors, &errorHandler,
7632 "charmap", "character maps to <undefined>",
7633 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007634 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007635 goto onError;
7636 }
7637 continue;
7638 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007639
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007640 if (unicode_putchar(&v, &outpos, x) < 0)
7641 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007643 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007644 }
7645 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 while (s < e) {
7647 unsigned char ch = *s;
7648 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007649
Benjamin Peterson29060642009-01-31 22:14:21 +00007650 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7651 w = PyLong_FromLong((long)ch);
7652 if (w == NULL)
7653 goto onError;
7654 x = PyObject_GetItem(mapping, w);
7655 Py_DECREF(w);
7656 if (x == NULL) {
7657 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7658 /* No mapping found means: mapping is undefined. */
7659 PyErr_Clear();
7660 x = Py_None;
7661 Py_INCREF(x);
7662 } else
7663 goto onError;
7664 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007665
Benjamin Peterson29060642009-01-31 22:14:21 +00007666 /* Apply mapping */
7667 if (PyLong_Check(x)) {
7668 long value = PyLong_AS_LONG(x);
7669 if (value < 0 || value > 65535) {
7670 PyErr_SetString(PyExc_TypeError,
7671 "character mapping must be in range(65536)");
7672 Py_DECREF(x);
7673 goto onError;
7674 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007675 if (unicode_putchar(&v, &outpos, value) < 0)
7676 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007677 }
7678 else if (x == Py_None) {
7679 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007680 startinpos = s-starts;
7681 endinpos = startinpos+1;
7682 if (unicode_decode_call_errorhandler(
7683 errors, &errorHandler,
7684 "charmap", "character maps to <undefined>",
7685 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007686 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007687 Py_DECREF(x);
7688 goto onError;
7689 }
7690 Py_DECREF(x);
7691 continue;
7692 }
7693 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007694 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007695
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007696 if (PyUnicode_READY(x) < 0)
7697 goto onError;
7698 targetsize = PyUnicode_GET_LENGTH(x);
7699
7700 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007701 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007702 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007703 PyUnicode_READ_CHAR(x, 0)) < 0)
7704 goto onError;
7705 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 else if (targetsize > 1) {
7707 /* 1-n mapping */
7708 if (targetsize > extrachars) {
7709 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 Py_ssize_t needed = (targetsize - extrachars) + \
7711 (targetsize << 2);
7712 extrachars += needed;
7713 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007714 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007715 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007716 Py_DECREF(x);
7717 goto onError;
7718 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007719 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007720 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7721 goto onError;
7722 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7723 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007724 extrachars -= targetsize;
7725 }
7726 /* 1-0 mapping: skip the character */
7727 }
7728 else {
7729 /* wrong return value */
7730 PyErr_SetString(PyExc_TypeError,
7731 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007732 Py_DECREF(x);
7733 goto onError;
7734 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007735 Py_DECREF(x);
7736 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007737 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007738 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007739 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007740 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007741 Py_XDECREF(errorHandler);
7742 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007743 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007744
Benjamin Peterson29060642009-01-31 22:14:21 +00007745 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007746 Py_XDECREF(errorHandler);
7747 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007748 Py_XDECREF(v);
7749 return NULL;
7750}
7751
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007752/* Charmap encoding: the lookup table */
7753
Alexander Belopolsky40018472011-02-26 01:02:56 +00007754struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007755 PyObject_HEAD
7756 unsigned char level1[32];
7757 int count2, count3;
7758 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007759};
7760
7761static PyObject*
7762encoding_map_size(PyObject *obj, PyObject* args)
7763{
7764 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007765 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007766 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007767}
7768
7769static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007770 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 PyDoc_STR("Return the size (in bytes) of this object") },
7772 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007773};
7774
7775static void
7776encoding_map_dealloc(PyObject* o)
7777{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007778 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007779}
7780
7781static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007782 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 "EncodingMap", /*tp_name*/
7784 sizeof(struct encoding_map), /*tp_basicsize*/
7785 0, /*tp_itemsize*/
7786 /* methods */
7787 encoding_map_dealloc, /*tp_dealloc*/
7788 0, /*tp_print*/
7789 0, /*tp_getattr*/
7790 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007791 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 0, /*tp_repr*/
7793 0, /*tp_as_number*/
7794 0, /*tp_as_sequence*/
7795 0, /*tp_as_mapping*/
7796 0, /*tp_hash*/
7797 0, /*tp_call*/
7798 0, /*tp_str*/
7799 0, /*tp_getattro*/
7800 0, /*tp_setattro*/
7801 0, /*tp_as_buffer*/
7802 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7803 0, /*tp_doc*/
7804 0, /*tp_traverse*/
7805 0, /*tp_clear*/
7806 0, /*tp_richcompare*/
7807 0, /*tp_weaklistoffset*/
7808 0, /*tp_iter*/
7809 0, /*tp_iternext*/
7810 encoding_map_methods, /*tp_methods*/
7811 0, /*tp_members*/
7812 0, /*tp_getset*/
7813 0, /*tp_base*/
7814 0, /*tp_dict*/
7815 0, /*tp_descr_get*/
7816 0, /*tp_descr_set*/
7817 0, /*tp_dictoffset*/
7818 0, /*tp_init*/
7819 0, /*tp_alloc*/
7820 0, /*tp_new*/
7821 0, /*tp_free*/
7822 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007823};
7824
7825PyObject*
7826PyUnicode_BuildEncodingMap(PyObject* string)
7827{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007828 PyObject *result;
7829 struct encoding_map *mresult;
7830 int i;
7831 int need_dict = 0;
7832 unsigned char level1[32];
7833 unsigned char level2[512];
7834 unsigned char *mlevel1, *mlevel2, *mlevel3;
7835 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007836 int kind;
7837 void *data;
7838 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007840 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007841 PyErr_BadArgument();
7842 return NULL;
7843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007844 kind = PyUnicode_KIND(string);
7845 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007846 memset(level1, 0xFF, sizeof level1);
7847 memset(level2, 0xFF, sizeof level2);
7848
7849 /* If there isn't a one-to-one mapping of NULL to \0,
7850 or if there are non-BMP characters, we need to use
7851 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007852 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007853 need_dict = 1;
7854 for (i = 1; i < 256; i++) {
7855 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007856 ch = PyUnicode_READ(kind, data, i);
7857 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007858 need_dict = 1;
7859 break;
7860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007861 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007862 /* unmapped character */
7863 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007864 l1 = ch >> 11;
7865 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007866 if (level1[l1] == 0xFF)
7867 level1[l1] = count2++;
7868 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007869 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007870 }
7871
7872 if (count2 >= 0xFF || count3 >= 0xFF)
7873 need_dict = 1;
7874
7875 if (need_dict) {
7876 PyObject *result = PyDict_New();
7877 PyObject *key, *value;
7878 if (!result)
7879 return NULL;
7880 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007881 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007882 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007883 if (!key || !value)
7884 goto failed1;
7885 if (PyDict_SetItem(result, key, value) == -1)
7886 goto failed1;
7887 Py_DECREF(key);
7888 Py_DECREF(value);
7889 }
7890 return result;
7891 failed1:
7892 Py_XDECREF(key);
7893 Py_XDECREF(value);
7894 Py_DECREF(result);
7895 return NULL;
7896 }
7897
7898 /* Create a three-level trie */
7899 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7900 16*count2 + 128*count3 - 1);
7901 if (!result)
7902 return PyErr_NoMemory();
7903 PyObject_Init(result, &EncodingMapType);
7904 mresult = (struct encoding_map*)result;
7905 mresult->count2 = count2;
7906 mresult->count3 = count3;
7907 mlevel1 = mresult->level1;
7908 mlevel2 = mresult->level23;
7909 mlevel3 = mresult->level23 + 16*count2;
7910 memcpy(mlevel1, level1, 32);
7911 memset(mlevel2, 0xFF, 16*count2);
7912 memset(mlevel3, 0, 128*count3);
7913 count3 = 0;
7914 for (i = 1; i < 256; i++) {
7915 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007916 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007917 /* unmapped character */
7918 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007919 o1 = PyUnicode_READ(kind, data, i)>>11;
7920 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007921 i2 = 16*mlevel1[o1] + o2;
7922 if (mlevel2[i2] == 0xFF)
7923 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007924 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007925 i3 = 128*mlevel2[i2] + o3;
7926 mlevel3[i3] = i;
7927 }
7928 return result;
7929}
7930
7931static int
Victor Stinner22168992011-11-20 17:09:18 +01007932encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007933{
7934 struct encoding_map *map = (struct encoding_map*)mapping;
7935 int l1 = c>>11;
7936 int l2 = (c>>7) & 0xF;
7937 int l3 = c & 0x7F;
7938 int i;
7939
Victor Stinner22168992011-11-20 17:09:18 +01007940 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007941 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007942 if (c == 0)
7943 return 0;
7944 /* level 1*/
7945 i = map->level1[l1];
7946 if (i == 0xFF) {
7947 return -1;
7948 }
7949 /* level 2*/
7950 i = map->level23[16*i+l2];
7951 if (i == 0xFF) {
7952 return -1;
7953 }
7954 /* level 3 */
7955 i = map->level23[16*map->count2 + 128*i + l3];
7956 if (i == 0) {
7957 return -1;
7958 }
7959 return i;
7960}
7961
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007962/* Lookup the character ch in the mapping. If the character
7963 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007964 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007965static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007966charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967{
Christian Heimes217cfd12007-12-02 14:31:20 +00007968 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007969 PyObject *x;
7970
7971 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007972 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007973 x = PyObject_GetItem(mapping, w);
7974 Py_DECREF(w);
7975 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7977 /* No mapping found means: mapping is undefined. */
7978 PyErr_Clear();
7979 x = Py_None;
7980 Py_INCREF(x);
7981 return x;
7982 } else
7983 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007985 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007987 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 long value = PyLong_AS_LONG(x);
7989 if (value < 0 || value > 255) {
7990 PyErr_SetString(PyExc_TypeError,
7991 "character mapping must be in range(256)");
7992 Py_DECREF(x);
7993 return NULL;
7994 }
7995 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007996 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007997 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 /* wrong return value */
8001 PyErr_Format(PyExc_TypeError,
8002 "character mapping must return integer, bytes or None, not %.400s",
8003 x->ob_type->tp_name);
8004 Py_DECREF(x);
8005 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006 }
8007}
8008
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008009static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008010charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008011{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008012 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8013 /* exponentially overallocate to minimize reallocations */
8014 if (requiredsize < 2*outsize)
8015 requiredsize = 2*outsize;
8016 if (_PyBytes_Resize(outobj, requiredsize))
8017 return -1;
8018 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008019}
8020
/* Outcome of trying to encode one character through a charmap
   (see charmapencode_output). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred; a Python exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008024/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008025 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008026 space is available. Return a new reference to the object that
8027 was put in the output buffer, or Py_None, if the mapping was undefined
8028 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008029 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008030static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008031charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008032 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008033{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008034 PyObject *rep;
8035 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008036 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008037
Christian Heimes90aa7642007-12-19 02:45:37 +00008038 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008039 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008041 if (res == -1)
8042 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 if (outsize<requiredsize)
8044 if (charmapencode_resize(outobj, outpos, requiredsize))
8045 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008046 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 outstart[(*outpos)++] = (char)res;
8048 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008049 }
8050
8051 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008052 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008054 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 Py_DECREF(rep);
8056 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008057 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 if (PyLong_Check(rep)) {
8059 Py_ssize_t requiredsize = *outpos+1;
8060 if (outsize<requiredsize)
8061 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8062 Py_DECREF(rep);
8063 return enc_EXCEPTION;
8064 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008065 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008067 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 else {
8069 const char *repchars = PyBytes_AS_STRING(rep);
8070 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8071 Py_ssize_t requiredsize = *outpos+repsize;
8072 if (outsize<requiredsize)
8073 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8074 Py_DECREF(rep);
8075 return enc_EXCEPTION;
8076 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008077 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 memcpy(outstart + *outpos, repchars, repsize);
8079 *outpos += repsize;
8080 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008081 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008082 Py_DECREF(rep);
8083 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008084}
8085
8086/* handle an error in PyUnicode_EncodeCharmap
8087 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008088static int
8089charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008090 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008091 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008092 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008093 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008094{
8095 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008096 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008097 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008098 enum PyUnicode_Kind kind;
8099 void *data;
8100 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008101 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008102 Py_ssize_t collstartpos = *inpos;
8103 Py_ssize_t collendpos = *inpos+1;
8104 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008105 char *encoding = "charmap";
8106 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008107 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008108 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008109 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008110
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008111 if (PyUnicode_READY(unicode) < 0)
8112 return -1;
8113 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008114 /* find all unencodable characters */
8115 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008116 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008117 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008118 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008119 val = encoding_map_lookup(ch, mapping);
8120 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 break;
8122 ++collendpos;
8123 continue;
8124 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008125
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008126 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8127 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008128 if (rep==NULL)
8129 return -1;
8130 else if (rep!=Py_None) {
8131 Py_DECREF(rep);
8132 break;
8133 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008134 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008136 }
8137 /* cache callback name lookup
8138 * (if not done yet, i.e. it's the first error) */
8139 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008140 if ((errors==NULL) || (!strcmp(errors, "strict")))
8141 *known_errorHandler = 1;
8142 else if (!strcmp(errors, "replace"))
8143 *known_errorHandler = 2;
8144 else if (!strcmp(errors, "ignore"))
8145 *known_errorHandler = 3;
8146 else if (!strcmp(errors, "xmlcharrefreplace"))
8147 *known_errorHandler = 4;
8148 else
8149 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008150 }
8151 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008152 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008153 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008154 return -1;
8155 case 2: /* replace */
8156 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008157 x = charmapencode_output('?', mapping, res, respos);
8158 if (x==enc_EXCEPTION) {
8159 return -1;
8160 }
8161 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008162 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008163 return -1;
8164 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008165 }
8166 /* fall through */
8167 case 3: /* ignore */
8168 *inpos = collendpos;
8169 break;
8170 case 4: /* xmlcharrefreplace */
8171 /* generate replacement (temporarily (mis)uses p) */
8172 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 char buffer[2+29+1+1];
8174 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008175 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008176 for (cp = buffer; *cp; ++cp) {
8177 x = charmapencode_output(*cp, mapping, res, respos);
8178 if (x==enc_EXCEPTION)
8179 return -1;
8180 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008181 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 return -1;
8183 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008184 }
8185 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008186 *inpos = collendpos;
8187 break;
8188 default:
8189 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008190 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008192 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008193 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008194 if (PyBytes_Check(repunicode)) {
8195 /* Directly copy bytes result to output. */
8196 Py_ssize_t outsize = PyBytes_Size(*res);
8197 Py_ssize_t requiredsize;
8198 repsize = PyBytes_Size(repunicode);
8199 requiredsize = *respos + repsize;
8200 if (requiredsize > outsize)
8201 /* Make room for all additional bytes. */
8202 if (charmapencode_resize(res, respos, requiredsize)) {
8203 Py_DECREF(repunicode);
8204 return -1;
8205 }
8206 memcpy(PyBytes_AsString(*res) + *respos,
8207 PyBytes_AsString(repunicode), repsize);
8208 *respos += repsize;
8209 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008210 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008211 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008212 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008213 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008214 if (PyUnicode_READY(repunicode) < 0) {
8215 Py_DECREF(repunicode);
8216 return -1;
8217 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008218 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008219 data = PyUnicode_DATA(repunicode);
8220 kind = PyUnicode_KIND(repunicode);
8221 for (index = 0; index < repsize; index++) {
8222 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8223 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008225 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 return -1;
8227 }
8228 else if (x==enc_FAILED) {
8229 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008230 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 return -1;
8232 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008233 }
8234 *inpos = newpos;
8235 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008236 }
8237 return 0;
8238}
8239
Alexander Belopolsky40018472011-02-26 01:02:56 +00008240PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008241_PyUnicode_EncodeCharmap(PyObject *unicode,
8242 PyObject *mapping,
8243 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008244{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008245 /* output object */
8246 PyObject *res = NULL;
8247 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008248 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008249 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008250 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008251 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008252 PyObject *errorHandler = NULL;
8253 PyObject *exc = NULL;
8254 /* the following variable is used for caching string comparisons
8255 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8256 * 3=ignore, 4=xmlcharrefreplace */
8257 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008258
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008259 if (PyUnicode_READY(unicode) < 0)
8260 return NULL;
8261 size = PyUnicode_GET_LENGTH(unicode);
8262
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263 /* Default to Latin-1 */
8264 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008265 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008267 /* allocate enough for a simple encoding without
8268 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008269 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270 if (res == NULL)
8271 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008272 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008275 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008276 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008278 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 if (x==enc_EXCEPTION) /* error */
8280 goto onError;
8281 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008282 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 &exc,
8284 &known_errorHandler, &errorHandler, errors,
8285 &res, &respos)) {
8286 goto onError;
8287 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008288 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 else
8290 /* done with this character => adjust input position */
8291 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008295 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008296 if (_PyBytes_Resize(&res, respos) < 0)
8297 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008298
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008299 Py_XDECREF(exc);
8300 Py_XDECREF(errorHandler);
8301 return res;
8302
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008304 Py_XDECREF(res);
8305 Py_XDECREF(exc);
8306 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 return NULL;
8308}
8309
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008310/* Deprecated */
8311PyObject *
8312PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8313 Py_ssize_t size,
8314 PyObject *mapping,
8315 const char *errors)
8316{
8317 PyObject *result;
8318 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8319 if (unicode == NULL)
8320 return NULL;
8321 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8322 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008323 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008324}
8325
Alexander Belopolsky40018472011-02-26 01:02:56 +00008326PyObject *
8327PyUnicode_AsCharmapString(PyObject *unicode,
8328 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329{
8330 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008331 PyErr_BadArgument();
8332 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008334 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008335}
8336
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008337/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008338static void
8339make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008340 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008341 Py_ssize_t startpos, Py_ssize_t endpos,
8342 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008344 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008345 *exceptionObject = _PyUnicodeTranslateError_Create(
8346 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347 }
8348 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8350 goto onError;
8351 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8352 goto onError;
8353 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8354 goto onError;
8355 return;
8356 onError:
8357 Py_DECREF(*exceptionObject);
8358 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359 }
8360}
8361
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008363static void
8364raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008365 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008366 Py_ssize_t startpos, Py_ssize_t endpos,
8367 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008368{
8369 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008370 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008371 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008372 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008373}
8374
8375/* error handling callback helper:
8376 build arguments, call the callback and check the arguments,
8377 put the result into newpos and return the replacement string, which
8378 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008379static PyObject *
8380unicode_translate_call_errorhandler(const char *errors,
8381 PyObject **errorHandler,
8382 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008383 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008384 Py_ssize_t startpos, Py_ssize_t endpos,
8385 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008387 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008388
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008389 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390 PyObject *restuple;
8391 PyObject *resunicode;
8392
8393 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008395 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008397 }
8398
8399 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008400 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403
8404 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008409 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 Py_DECREF(restuple);
8411 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 }
8413 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 &resunicode, &i_newpos)) {
8415 Py_DECREF(restuple);
8416 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008418 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008419 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008420 else
8421 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008422 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8424 Py_DECREF(restuple);
8425 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008426 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008427 Py_INCREF(resunicode);
8428 Py_DECREF(restuple);
8429 return resunicode;
8430}
8431
8432/* Lookup the character ch in the mapping and put the result in result,
8433 which must be decrefed by the caller.
8434 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008435static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008436charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008437{
Christian Heimes217cfd12007-12-02 14:31:20 +00008438 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008439 PyObject *x;
8440
8441 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008443 x = PyObject_GetItem(mapping, w);
8444 Py_DECREF(w);
8445 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8447 /* No mapping found means: use 1:1 mapping. */
8448 PyErr_Clear();
8449 *result = NULL;
8450 return 0;
8451 } else
8452 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008453 }
8454 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 *result = x;
8456 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008457 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008458 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 long value = PyLong_AS_LONG(x);
8460 long max = PyUnicode_GetMax();
8461 if (value < 0 || value > max) {
8462 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008463 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 Py_DECREF(x);
8465 return -1;
8466 }
8467 *result = x;
8468 return 0;
8469 }
8470 else if (PyUnicode_Check(x)) {
8471 *result = x;
8472 return 0;
8473 }
8474 else {
8475 /* wrong return value */
8476 PyErr_SetString(PyExc_TypeError,
8477 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008478 Py_DECREF(x);
8479 return -1;
8480 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008481}
8482/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 if not reallocate and adjust various state variables.
8484 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008485static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008486charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008488{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008490 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 /* exponentially overallocate to minimize reallocations */
8492 if (requiredsize < 2 * oldsize)
8493 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008494 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8495 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008497 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008498 }
8499 return 0;
8500}
8501/* lookup the character, put the result in the output string and adjust
8502 various state variables. Return a new reference to the object that
8503 was put in the output buffer in *result, or Py_None, if the mapping was
8504 undefined (in which case no character was written).
8505 The called must decref result.
8506 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008507static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8509 PyObject *mapping, Py_UCS4 **output,
8510 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008511 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008512{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008513 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8514 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008516 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008518 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519 }
8520 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008522 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008524 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525 }
8526 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527 Py_ssize_t repsize;
8528 if (PyUnicode_READY(*res) == -1)
8529 return -1;
8530 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008531 if (repsize==1) {
8532 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008533 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 }
8535 else if (repsize!=0) {
8536 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008537 Py_ssize_t requiredsize = *opos +
8538 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008539 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 Py_ssize_t i;
8541 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008543 for(i = 0; i < repsize; i++)
8544 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 }
8547 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008548 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008549 return 0;
8550}
8551
Alexander Belopolsky40018472011-02-26 01:02:56 +00008552PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008553_PyUnicode_TranslateCharmap(PyObject *input,
8554 PyObject *mapping,
8555 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557 /* input object */
8558 char *idata;
8559 Py_ssize_t size, i;
8560 int kind;
8561 /* output buffer */
8562 Py_UCS4 *output = NULL;
8563 Py_ssize_t osize;
8564 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008565 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008567 char *reason = "character maps to <undefined>";
8568 PyObject *errorHandler = NULL;
8569 PyObject *exc = NULL;
8570 /* the following variable is used for caching string comparisons
8571 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8572 * 3=ignore, 4=xmlcharrefreplace */
8573 int known_errorHandler = -1;
8574
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 PyErr_BadArgument();
8577 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008580 if (PyUnicode_READY(input) == -1)
8581 return NULL;
8582 idata = (char*)PyUnicode_DATA(input);
8583 kind = PyUnicode_KIND(input);
8584 size = PyUnicode_GET_LENGTH(input);
8585 i = 0;
8586
8587 if (size == 0) {
8588 Py_INCREF(input);
8589 return input;
8590 }
8591
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 /* allocate enough for a simple 1:1 translation without
8593 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008594 osize = size;
8595 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8596 opos = 0;
8597 if (output == NULL) {
8598 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008602 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 /* try to encode it */
8604 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008605 if (charmaptranslate_output(input, i, mapping,
8606 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 Py_XDECREF(x);
8608 goto onError;
8609 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008610 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 else { /* untranslatable character */
8614 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8615 Py_ssize_t repsize;
8616 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008617 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 Py_ssize_t collstart = i;
8620 Py_ssize_t collend = i+1;
8621 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622
Benjamin Peterson29060642009-01-31 22:14:21 +00008623 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624 while (collend < size) {
8625 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 goto onError;
8627 Py_XDECREF(x);
8628 if (x!=Py_None)
8629 break;
8630 ++collend;
8631 }
8632 /* cache callback name lookup
8633 * (if not done yet, i.e. it's the first error) */
8634 if (known_errorHandler==-1) {
8635 if ((errors==NULL) || (!strcmp(errors, "strict")))
8636 known_errorHandler = 1;
8637 else if (!strcmp(errors, "replace"))
8638 known_errorHandler = 2;
8639 else if (!strcmp(errors, "ignore"))
8640 known_errorHandler = 3;
8641 else if (!strcmp(errors, "xmlcharrefreplace"))
8642 known_errorHandler = 4;
8643 else
8644 known_errorHandler = 0;
8645 }
8646 switch (known_errorHandler) {
8647 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 raise_translate_exception(&exc, input, collstart,
8649 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008650 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 case 2: /* replace */
8652 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653 for (coll = collstart; coll<collend; coll++)
8654 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 /* fall through */
8656 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 break;
8659 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 /* generate replacement (temporarily (mis)uses i) */
8661 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 char buffer[2+29+1+1];
8663 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8665 if (charmaptranslate_makespace(&output, &osize,
8666 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 goto onError;
8668 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008669 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 break;
8673 default:
8674 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008675 reason, input, &exc,
8676 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008677 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008679 if (PyUnicode_READY(repunicode) < 0) {
8680 Py_DECREF(repunicode);
8681 goto onError;
8682 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 repsize = PyUnicode_GET_LENGTH(repunicode);
8685 if (charmaptranslate_makespace(&output, &osize,
8686 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 Py_DECREF(repunicode);
8688 goto onError;
8689 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690 for (uni2 = 0; repsize-->0; ++uni2)
8691 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8692 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008694 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008695 }
8696 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8698 if (!res)
8699 goto onError;
8700 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701 Py_XDECREF(exc);
8702 Py_XDECREF(errorHandler);
8703 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008706 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008707 Py_XDECREF(exc);
8708 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008709 return NULL;
8710}
8711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712/* Deprecated. Use PyUnicode_Translate instead. */
8713PyObject *
8714PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8715 Py_ssize_t size,
8716 PyObject *mapping,
8717 const char *errors)
8718{
8719 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8720 if (!unicode)
8721 return NULL;
8722 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8723}
8724
Alexander Belopolsky40018472011-02-26 01:02:56 +00008725PyObject *
8726PyUnicode_Translate(PyObject *str,
8727 PyObject *mapping,
8728 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729{
8730 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008731
Guido van Rossumd57fd912000-03-10 22:53:23 +00008732 str = PyUnicode_FromObject(str);
8733 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008735 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736 Py_DECREF(str);
8737 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008738
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740 Py_XDECREF(str);
8741 return NULL;
8742}
Tim Petersced69f82003-09-16 20:30:58 +00008743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008744static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008745fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746{
8747 /* No need to call PyUnicode_READY(self) because this function is only
8748 called as a callback from fixup() which does it already. */
8749 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8750 const int kind = PyUnicode_KIND(self);
8751 void *data = PyUnicode_DATA(self);
8752 Py_UCS4 maxchar = 0, ch, fixed;
8753 Py_ssize_t i;
8754
8755 for (i = 0; i < len; ++i) {
8756 ch = PyUnicode_READ(kind, data, i);
8757 fixed = 0;
8758 if (ch > 127) {
8759 if (Py_UNICODE_ISSPACE(ch))
8760 fixed = ' ';
8761 else {
8762 const int decimal = Py_UNICODE_TODECIMAL(ch);
8763 if (decimal >= 0)
8764 fixed = '0' + decimal;
8765 }
8766 if (fixed != 0) {
8767 if (fixed > maxchar)
8768 maxchar = fixed;
8769 PyUnicode_WRITE(kind, data, i, fixed);
8770 }
8771 else if (ch > maxchar)
8772 maxchar = ch;
8773 }
8774 else if (ch > maxchar)
8775 maxchar = ch;
8776 }
8777
8778 return maxchar;
8779}
8780
8781PyObject *
8782_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8783{
8784 if (!PyUnicode_Check(unicode)) {
8785 PyErr_BadInternalCall();
8786 return NULL;
8787 }
8788 if (PyUnicode_READY(unicode) == -1)
8789 return NULL;
8790 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8791 /* If the string is already ASCII, just return the same string */
8792 Py_INCREF(unicode);
8793 return unicode;
8794 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008795 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008796}
8797
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008798PyObject *
8799PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8800 Py_ssize_t length)
8801{
Victor Stinnerf0124502011-11-21 23:12:56 +01008802 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008803 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008804 Py_UCS4 maxchar;
8805 enum PyUnicode_Kind kind;
8806 void *data;
8807
8808 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008809 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008810 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008811 if (ch > 127) {
8812 int decimal = Py_UNICODE_TODECIMAL(ch);
8813 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008814 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008815 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008816 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008817 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008818
8819 /* Copy to a new string */
8820 decimal = PyUnicode_New(length, maxchar);
8821 if (decimal == NULL)
8822 return decimal;
8823 kind = PyUnicode_KIND(decimal);
8824 data = PyUnicode_DATA(decimal);
8825 /* Iterate over code points */
8826 for (i = 0; i < length; i++) {
8827 Py_UNICODE ch = s[i];
8828 if (ch > 127) {
8829 int decimal = Py_UNICODE_TODECIMAL(ch);
8830 if (decimal >= 0)
8831 ch = '0' + decimal;
8832 }
8833 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008834 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008835 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008836}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008837/* --- Decimal Encoder ---------------------------------------------------- */
8838
Alexander Belopolsky40018472011-02-26 01:02:56 +00008839int
8840PyUnicode_EncodeDecimal(Py_UNICODE *s,
8841 Py_ssize_t length,
8842 char *output,
8843 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008844{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008845 PyObject *errorHandler = NULL;
8846 PyObject *exc = NULL;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008847 PyObject *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008848 const char *encoding = "decimal";
8849 const char *reason = "invalid decimal Unicode string";
8850 /* the following variable is used for caching string comparisons
8851 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8852 int known_errorHandler = -1;
Victor Stinner42bf7752011-11-21 22:52:58 +01008853 Py_ssize_t i, j;
8854 enum PyUnicode_Kind kind;
8855 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008856
8857 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008858 PyErr_BadArgument();
8859 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008860 }
8861
Victor Stinner42bf7752011-11-21 22:52:58 +01008862 unicode = PyUnicode_FromUnicode(s, length);
8863 if (unicode == NULL)
8864 return -1;
8865
8866 if (PyUnicode_READY(unicode) < 0)
8867 goto onError;
8868 kind = PyUnicode_KIND(unicode);
8869 data = PyUnicode_DATA(unicode);
8870
Victor Stinnerb84d7232011-11-22 01:50:07 +01008871 for (i=0; i < length; ) {
Victor Stinner42bf7752011-11-21 22:52:58 +01008872 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008873 int decimal;
Victor Stinner42bf7752011-11-21 22:52:58 +01008874 Py_ssize_t startpos, endpos;
Tim Petersced69f82003-09-16 20:30:58 +00008875
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008877 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008878 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008879 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008880 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 decimal = Py_UNICODE_TODECIMAL(ch);
8882 if (decimal >= 0) {
8883 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008884 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008885 continue;
8886 }
8887 if (0 < ch && ch < 256) {
8888 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008889 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008890 continue;
8891 }
8892 /* All other characters are considered unencodable */
Victor Stinner42bf7752011-11-21 22:52:58 +01008893 startpos = i;
8894 endpos = i+1;
8895 for (; endpos < length; endpos++) {
8896 ch = PyUnicode_READ(kind, data, endpos);
8897 if ((0 < ch && ch < 256) ||
Victor Stinnerb84d7232011-11-22 01:50:07 +01008898 Py_UNICODE_ISSPACE(ch) ||
8899 0 <= Py_UNICODE_TODECIMAL(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00008900 break;
8901 }
8902 /* cache callback name lookup
8903 * (if not done yet, i.e. it's the first error) */
8904 if (known_errorHandler==-1) {
8905 if ((errors==NULL) || (!strcmp(errors, "strict")))
8906 known_errorHandler = 1;
8907 else if (!strcmp(errors, "replace"))
8908 known_errorHandler = 2;
8909 else if (!strcmp(errors, "ignore"))
8910 known_errorHandler = 3;
8911 else if (!strcmp(errors, "xmlcharrefreplace"))
8912 known_errorHandler = 4;
8913 else
8914 known_errorHandler = 0;
8915 }
8916 switch (known_errorHandler) {
8917 case 1: /* strict */
Victor Stinner42bf7752011-11-21 22:52:58 +01008918 raise_encode_exception(&exc, encoding, unicode, startpos, endpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008919 goto onError;
8920 case 2: /* replace */
Victor Stinner42bf7752011-11-21 22:52:58 +01008921 for (j=startpos; j < endpos; j++)
Benjamin Peterson29060642009-01-31 22:14:21 +00008922 *output++ = '?';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008923 i = endpos;
8924 break;
Benjamin Peterson29060642009-01-31 22:14:21 +00008925 case 3: /* ignore */
Victor Stinner42bf7752011-11-21 22:52:58 +01008926 i = endpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008927 break;
8928 case 4: /* xmlcharrefreplace */
Victor Stinner42bf7752011-11-21 22:52:58 +01008929 /* generate replacement */
8930 for (j=startpos; j < endpos; j++) {
8931 ch = PyUnicode_READ(kind, data, i);
8932 output += sprintf(output, "&#%d;", (int)ch);
8933 i++;
8934 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 break;
8936 default:
Victor Stinner42bf7752011-11-21 22:52:58 +01008937 {
8938 PyObject *repunicode;
8939 Py_ssize_t repsize, newpos, k;
8940 enum PyUnicode_Kind repkind;
8941 void *repdata;
8942
Benjamin Peterson29060642009-01-31 22:14:21 +00008943 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008944 encoding, reason, unicode, &exc,
Victor Stinner42bf7752011-11-21 22:52:58 +01008945 startpos, endpos, &newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 if (repunicode == NULL)
8947 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008948 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008949 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008950 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8951 Py_DECREF(repunicode);
8952 goto onError;
8953 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008954 if (PyUnicode_READY(repunicode) < 0) {
8955 Py_DECREF(repunicode);
8956 goto onError;
8957 }
8958 repkind = PyUnicode_KIND(repunicode);
8959 repdata = PyUnicode_DATA(repunicode);
8960
Benjamin Peterson29060642009-01-31 22:14:21 +00008961 /* generate replacement */
8962 repsize = PyUnicode_GET_SIZE(repunicode);
Victor Stinner42bf7752011-11-21 22:52:58 +01008963 for (k=0; k<repsize; k++) {
8964 ch = PyUnicode_READ(repkind, repdata, k);
Benjamin Peterson29060642009-01-31 22:14:21 +00008965 if (Py_UNICODE_ISSPACE(ch))
8966 *output++ = ' ';
8967 else {
8968 decimal = Py_UNICODE_TODECIMAL(ch);
8969 if (decimal >= 0)
8970 *output++ = '0' + decimal;
8971 else if (0 < ch && ch < 256)
8972 *output++ = (char)ch;
8973 else {
8974 Py_DECREF(repunicode);
8975 raise_encode_exception(&exc, encoding,
Victor Stinner42bf7752011-11-21 22:52:58 +01008976 unicode, startpos, endpos,
8977 reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008978 goto onError;
8979 }
8980 }
8981 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008982 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008983 Py_DECREF(repunicode);
8984 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008985 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008986 }
8987 /* 0-terminate the output string */
8988 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008989 Py_XDECREF(exc);
8990 Py_XDECREF(errorHandler);
Victor Stinner42bf7752011-11-21 22:52:58 +01008991 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008992 return 0;
8993
Benjamin Peterson29060642009-01-31 22:14:21 +00008994 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008995 Py_XDECREF(exc);
8996 Py_XDECREF(errorHandler);
Victor Stinner42bf7752011-11-21 22:52:58 +01008997 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008998 return -1;
8999}
9000
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001/* --- Helpers ------------------------------------------------------------ */
9002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009004any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009005 Py_ssize_t start,
9006 Py_ssize_t end)
9007{
9008 int kind1, kind2, kind;
9009 void *buf1, *buf2;
9010 Py_ssize_t len1, len2, result;
9011
9012 kind1 = PyUnicode_KIND(s1);
9013 kind2 = PyUnicode_KIND(s2);
9014 kind = kind1 > kind2 ? kind1 : kind2;
9015 buf1 = PyUnicode_DATA(s1);
9016 buf2 = PyUnicode_DATA(s2);
9017 if (kind1 != kind)
9018 buf1 = _PyUnicode_AsKind(s1, kind);
9019 if (!buf1)
9020 return -2;
9021 if (kind2 != kind)
9022 buf2 = _PyUnicode_AsKind(s2, kind);
9023 if (!buf2) {
9024 if (kind1 != kind) PyMem_Free(buf1);
9025 return -2;
9026 }
9027 len1 = PyUnicode_GET_LENGTH(s1);
9028 len2 = PyUnicode_GET_LENGTH(s2);
9029
Victor Stinner794d5672011-10-10 03:21:36 +02009030 if (direction > 0) {
9031 switch(kind) {
9032 case PyUnicode_1BYTE_KIND:
9033 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9034 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9035 else
9036 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9037 break;
9038 case PyUnicode_2BYTE_KIND:
9039 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9040 break;
9041 case PyUnicode_4BYTE_KIND:
9042 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9043 break;
9044 default:
9045 assert(0); result = -2;
9046 }
9047 }
9048 else {
9049 switch(kind) {
9050 case PyUnicode_1BYTE_KIND:
9051 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9052 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9053 else
9054 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9055 break;
9056 case PyUnicode_2BYTE_KIND:
9057 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9058 break;
9059 case PyUnicode_4BYTE_KIND:
9060 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9061 break;
9062 default:
9063 assert(0); result = -2;
9064 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009065 }
9066
9067 if (kind1 != kind)
9068 PyMem_Free(buf1);
9069 if (kind2 != kind)
9070 PyMem_Free(buf2);
9071
9072 return result;
9073}
9074
9075Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009076_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077 Py_ssize_t n_buffer,
9078 void *digits, Py_ssize_t n_digits,
9079 Py_ssize_t min_width,
9080 const char *grouping,
9081 const char *thousands_sep)
9082{
9083 switch(kind) {
9084 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009085 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9086 return _PyUnicode_ascii_InsertThousandsGrouping(
9087 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9088 min_width, grouping, thousands_sep);
9089 else
9090 return _PyUnicode_ucs1_InsertThousandsGrouping(
9091 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9092 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093 case PyUnicode_2BYTE_KIND:
9094 return _PyUnicode_ucs2_InsertThousandsGrouping(
9095 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9096 min_width, grouping, thousands_sep);
9097 case PyUnicode_4BYTE_KIND:
9098 return _PyUnicode_ucs4_InsertThousandsGrouping(
9099 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9100 min_width, grouping, thousands_sep);
9101 }
9102 assert(0);
9103 return -1;
9104}
9105
9106
/* Helper macro to fix up start/end slice values against a length len:
   - end greater than len is cut down to len;
   - a negative end or start is taken relative to len, and set to 0 if
     it is still negative afterwards;
   - a positive start is left as it is (it is not limited to len).
   start and end are modified in place and, like len, are evaluated more
   than once.
   NOTE(review): the expansion is a bare if/else-if sequence, not a
   do { } while (0) block, so use it only as a statement of its own,
   never as the unbraced body of an if/else. */
#define ADJUST_INDICES(start, end, len) \
    if (end > len)                      \
        end = len;                      \
    else if (end < 0) {                 \
        end += len;                     \
        if (end < 0)                    \
            end = 0;                    \
    }                                   \
    if (start < 0) {                    \
        start += len;                   \
        if (start < 0)                  \
            start = 0;                  \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009121
Alexander Belopolsky40018472011-02-26 01:02:56 +00009122Py_ssize_t
9123PyUnicode_Count(PyObject *str,
9124 PyObject *substr,
9125 Py_ssize_t start,
9126 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009128 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009129 PyObject* str_obj;
9130 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009131 int kind1, kind2, kind;
9132 void *buf1 = NULL, *buf2 = NULL;
9133 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009134
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009135 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009137 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009138 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009139 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009140 Py_DECREF(str_obj);
9141 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142 }
Tim Petersced69f82003-09-16 20:30:58 +00009143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009144 kind1 = PyUnicode_KIND(str_obj);
9145 kind2 = PyUnicode_KIND(sub_obj);
9146 kind = kind1 > kind2 ? kind1 : kind2;
9147 buf1 = PyUnicode_DATA(str_obj);
9148 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009149 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150 if (!buf1)
9151 goto onError;
9152 buf2 = PyUnicode_DATA(sub_obj);
9153 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009154 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155 if (!buf2)
9156 goto onError;
9157 len1 = PyUnicode_GET_LENGTH(str_obj);
9158 len2 = PyUnicode_GET_LENGTH(sub_obj);
9159
9160 ADJUST_INDICES(start, end, len1);
9161 switch(kind) {
9162 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009163 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9164 result = asciilib_count(
9165 ((Py_UCS1*)buf1) + start, end - start,
9166 buf2, len2, PY_SSIZE_T_MAX
9167 );
9168 else
9169 result = ucs1lib_count(
9170 ((Py_UCS1*)buf1) + start, end - start,
9171 buf2, len2, PY_SSIZE_T_MAX
9172 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009173 break;
9174 case PyUnicode_2BYTE_KIND:
9175 result = ucs2lib_count(
9176 ((Py_UCS2*)buf1) + start, end - start,
9177 buf2, len2, PY_SSIZE_T_MAX
9178 );
9179 break;
9180 case PyUnicode_4BYTE_KIND:
9181 result = ucs4lib_count(
9182 ((Py_UCS4*)buf1) + start, end - start,
9183 buf2, len2, PY_SSIZE_T_MAX
9184 );
9185 break;
9186 default:
9187 assert(0); result = 0;
9188 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009189
9190 Py_DECREF(sub_obj);
9191 Py_DECREF(str_obj);
9192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193 if (kind1 != kind)
9194 PyMem_Free(buf1);
9195 if (kind2 != kind)
9196 PyMem_Free(buf2);
9197
Guido van Rossumd57fd912000-03-10 22:53:23 +00009198 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009199 onError:
9200 Py_DECREF(sub_obj);
9201 Py_DECREF(str_obj);
9202 if (kind1 != kind && buf1)
9203 PyMem_Free(buf1);
9204 if (kind2 != kind && buf2)
9205 PyMem_Free(buf2);
9206 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009207}
9208
Alexander Belopolsky40018472011-02-26 01:02:56 +00009209Py_ssize_t
9210PyUnicode_Find(PyObject *str,
9211 PyObject *sub,
9212 Py_ssize_t start,
9213 Py_ssize_t end,
9214 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009216 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009217
Guido van Rossumd57fd912000-03-10 22:53:23 +00009218 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009219 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009220 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009221 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009222 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009223 Py_DECREF(str);
9224 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225 }
Tim Petersced69f82003-09-16 20:30:58 +00009226
Victor Stinner794d5672011-10-10 03:21:36 +02009227 result = any_find_slice(direction,
9228 str, sub, start, end
9229 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009230
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009232 Py_DECREF(sub);
9233
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234 return result;
9235}
9236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237Py_ssize_t
9238PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9239 Py_ssize_t start, Py_ssize_t end,
9240 int direction)
9241{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009243 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244 if (PyUnicode_READY(str) == -1)
9245 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009246 if (start < 0 || end < 0) {
9247 PyErr_SetString(PyExc_IndexError, "string index out of range");
9248 return -2;
9249 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009250 if (end > PyUnicode_GET_LENGTH(str))
9251 end = PyUnicode_GET_LENGTH(str);
9252 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009253 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9254 kind, end-start, ch, direction);
9255 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009257 else
9258 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009259}
9260
Alexander Belopolsky40018472011-02-26 01:02:56 +00009261static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009262tailmatch(PyObject *self,
9263 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009264 Py_ssize_t start,
9265 Py_ssize_t end,
9266 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009267{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009268 int kind_self;
9269 int kind_sub;
9270 void *data_self;
9271 void *data_sub;
9272 Py_ssize_t offset;
9273 Py_ssize_t i;
9274 Py_ssize_t end_sub;
9275
9276 if (PyUnicode_READY(self) == -1 ||
9277 PyUnicode_READY(substring) == -1)
9278 return 0;
9279
9280 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009281 return 1;
9282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9284 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009285 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009286 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288 kind_self = PyUnicode_KIND(self);
9289 data_self = PyUnicode_DATA(self);
9290 kind_sub = PyUnicode_KIND(substring);
9291 data_sub = PyUnicode_DATA(substring);
9292 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9293
9294 if (direction > 0)
9295 offset = end;
9296 else
9297 offset = start;
9298
9299 if (PyUnicode_READ(kind_self, data_self, offset) ==
9300 PyUnicode_READ(kind_sub, data_sub, 0) &&
9301 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9302 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9303 /* If both are of the same kind, memcmp is sufficient */
9304 if (kind_self == kind_sub) {
9305 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009306 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 data_sub,
9308 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009309 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310 }
9311 /* otherwise we have to compare each character by first accesing it */
9312 else {
9313 /* We do not need to compare 0 and len(substring)-1 because
9314 the if statement above ensured already that they are equal
9315 when we end up here. */
9316 // TODO: honor direction and do a forward or backwards search
9317 for (i = 1; i < end_sub; ++i) {
9318 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9319 PyUnicode_READ(kind_sub, data_sub, i))
9320 return 0;
9321 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009322 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009323 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009324 }
9325
9326 return 0;
9327}
9328
Alexander Belopolsky40018472011-02-26 01:02:56 +00009329Py_ssize_t
9330PyUnicode_Tailmatch(PyObject *str,
9331 PyObject *substr,
9332 Py_ssize_t start,
9333 Py_ssize_t end,
9334 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009335{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009336 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009337
Guido van Rossumd57fd912000-03-10 22:53:23 +00009338 str = PyUnicode_FromObject(str);
9339 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009340 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009341 substr = PyUnicode_FromObject(substr);
9342 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009343 Py_DECREF(str);
9344 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009345 }
Tim Petersced69f82003-09-16 20:30:58 +00009346
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009347 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009348 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349 Py_DECREF(str);
9350 Py_DECREF(substr);
9351 return result;
9352}
9353
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354/* Apply fixfct filter to the Unicode object self and return a
9355 reference to the modified object */
9356
Alexander Belopolsky40018472011-02-26 01:02:56 +00009357static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009358fixup(PyObject *self,
9359 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 PyObject *u;
9362 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363
Victor Stinner87af4f22011-11-21 23:03:47 +01009364 u = PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009366 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009367 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009369 /* fix functions return the new maximum character in a string,
9370 if the kind of the resulting unicode object does not change,
9371 everything is fine. Otherwise we need to change the string kind
9372 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009373 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 if (maxchar_new == 0)
9375 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9376 else if (maxchar_new <= 127)
9377 maxchar_new = 127;
9378 else if (maxchar_new <= 255)
9379 maxchar_new = 255;
9380 else if (maxchar_new <= 65535)
9381 maxchar_new = 65535;
9382 else
9383 maxchar_new = 1114111; /* 0x10ffff */
9384
9385 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009386 /* fixfct should return TRUE if it modified the buffer. If
9387 FALSE, return a reference to the original buffer instead
9388 (to save space, not time) */
9389 Py_INCREF(self);
9390 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009391 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393 else if (maxchar_new == maxchar_old) {
9394 return u;
9395 }
9396 else {
9397 /* In case the maximum character changed, we need to
9398 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009399 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009400 if (v == NULL) {
9401 Py_DECREF(u);
9402 return NULL;
9403 }
9404 if (maxchar_new > maxchar_old) {
9405 /* If the maxchar increased so that the kind changed, not all
9406 characters are representable anymore and we need to fix the
9407 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009408 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009409 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9411 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009412 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009413 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009414 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415
9416 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009417 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418 return v;
9419 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420}
9421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009422static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009423fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009424{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 /* No need to call PyUnicode_READY(self) because this function is only
9426 called as a callback from fixup() which does it already. */
9427 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9428 const int kind = PyUnicode_KIND(self);
9429 void *data = PyUnicode_DATA(self);
9430 int touched = 0;
9431 Py_UCS4 maxchar = 0;
9432 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434 for (i = 0; i < len; ++i) {
9435 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9436 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9437 if (up != ch) {
9438 if (up > maxchar)
9439 maxchar = up;
9440 PyUnicode_WRITE(kind, data, i, up);
9441 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009442 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 else if (ch > maxchar)
9444 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009445 }
9446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 if (touched)
9448 return maxchar;
9449 else
9450 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009451}
9452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009454fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9457 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9458 const int kind = PyUnicode_KIND(self);
9459 void *data = PyUnicode_DATA(self);
9460 int touched = 0;
9461 Py_UCS4 maxchar = 0;
9462 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 for(i = 0; i < len; ++i) {
9465 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9466 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9467 if (lo != ch) {
9468 if (lo > maxchar)
9469 maxchar = lo;
9470 PyUnicode_WRITE(kind, data, i, lo);
9471 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 else if (ch > maxchar)
9474 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475 }
9476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 if (touched)
9478 return maxchar;
9479 else
9480 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481}
9482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009484fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9487 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9488 const int kind = PyUnicode_KIND(self);
9489 void *data = PyUnicode_DATA(self);
9490 int touched = 0;
9491 Py_UCS4 maxchar = 0;
9492 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494 for(i = 0; i < len; ++i) {
9495 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9496 Py_UCS4 nu = 0;
9497
9498 if (Py_UNICODE_ISUPPER(ch))
9499 nu = Py_UNICODE_TOLOWER(ch);
9500 else if (Py_UNICODE_ISLOWER(ch))
9501 nu = Py_UNICODE_TOUPPER(ch);
9502
9503 if (nu != 0) {
9504 if (nu > maxchar)
9505 maxchar = nu;
9506 PyUnicode_WRITE(kind, data, i, nu);
9507 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 else if (ch > maxchar)
9510 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511 }
9512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 if (touched)
9514 return maxchar;
9515 else
9516 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517}
9518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009519static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009520fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009521{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009522 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9523 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9524 const int kind = PyUnicode_KIND(self);
9525 void *data = PyUnicode_DATA(self);
9526 int touched = 0;
9527 Py_UCS4 maxchar = 0;
9528 Py_ssize_t i = 0;
9529 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009530
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009531 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009532 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009533
9534 ch = PyUnicode_READ(kind, data, i);
9535 if (!Py_UNICODE_ISUPPER(ch)) {
9536 maxchar = Py_UNICODE_TOUPPER(ch);
9537 PyUnicode_WRITE(kind, data, i, maxchar);
9538 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 ++i;
9541 for(; i < len; ++i) {
9542 ch = PyUnicode_READ(kind, data, i);
9543 if (!Py_UNICODE_ISLOWER(ch)) {
9544 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9545 if (lo > maxchar)
9546 maxchar = lo;
9547 PyUnicode_WRITE(kind, data, i, lo);
9548 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550 else if (ch > maxchar)
9551 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009552 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009553
9554 if (touched)
9555 return maxchar;
9556 else
9557 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009558}
9559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009561fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009562{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9564 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9565 const int kind = PyUnicode_KIND(self);
9566 void *data = PyUnicode_DATA(self);
9567 Py_UCS4 maxchar = 0;
9568 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569 int previous_is_cased;
9570
9571 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572 if (len == 1) {
9573 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9574 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9575 if (ti != ch) {
9576 PyUnicode_WRITE(kind, data, i, ti);
9577 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009578 }
9579 else
9580 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009582 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009583 for(; i < len; ++i) {
9584 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9585 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009586
Benjamin Peterson29060642009-01-31 22:14:21 +00009587 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009589 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 nu = Py_UNICODE_TOTITLE(ch);
9591
9592 if (nu > maxchar)
9593 maxchar = nu;
9594 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009595
Benjamin Peterson29060642009-01-31 22:14:21 +00009596 if (Py_UNICODE_ISLOWER(ch) ||
9597 Py_UNICODE_ISUPPER(ch) ||
9598 Py_UNICODE_ISTITLE(ch))
9599 previous_is_cased = 1;
9600 else
9601 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009602 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009604}
9605
Tim Peters8ce9f162004-08-27 01:49:32 +00009606PyObject *
9607PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009609 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009610 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009612 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009613 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9614 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009615 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009616 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009617 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009619 int use_memcpy;
9620 unsigned char *res_data = NULL, *sep_data = NULL;
9621 PyObject *last_obj;
9622 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623
Tim Peters05eba1f2004-08-27 21:32:02 +00009624 fseq = PySequence_Fast(seq, "");
9625 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009626 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009627 }
9628
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009629 /* NOTE: the following code can't call back into Python code,
9630 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009631 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009632
Tim Peters05eba1f2004-08-27 21:32:02 +00009633 seqlen = PySequence_Fast_GET_SIZE(fseq);
9634 /* If empty sequence, return u"". */
9635 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009636 Py_DECREF(fseq);
9637 Py_INCREF(unicode_empty);
9638 res = unicode_empty;
9639 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009640 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009641
Tim Peters05eba1f2004-08-27 21:32:02 +00009642 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009643 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009644 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009645 if (seqlen == 1) {
9646 if (PyUnicode_CheckExact(items[0])) {
9647 res = items[0];
9648 Py_INCREF(res);
9649 Py_DECREF(fseq);
9650 return res;
9651 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009652 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009653 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009654 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009655 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009656 /* Set up sep and seplen */
9657 if (separator == NULL) {
9658 /* fall back to a blank space separator */
9659 sep = PyUnicode_FromOrdinal(' ');
9660 if (!sep)
9661 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009662 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009663 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009664 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009665 else {
9666 if (!PyUnicode_Check(separator)) {
9667 PyErr_Format(PyExc_TypeError,
9668 "separator: expected str instance,"
9669 " %.80s found",
9670 Py_TYPE(separator)->tp_name);
9671 goto onError;
9672 }
9673 if (PyUnicode_READY(separator))
9674 goto onError;
9675 sep = separator;
9676 seplen = PyUnicode_GET_LENGTH(separator);
9677 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9678 /* inc refcount to keep this code path symmetric with the
9679 above case of a blank separator */
9680 Py_INCREF(sep);
9681 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009682 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009683 }
9684
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009685 /* There are at least two things to join, or else we have a subclass
9686 * of str in the sequence.
9687 * Do a pre-pass to figure out the total amount of space we'll
9688 * need (sz), and see whether all argument are strings.
9689 */
9690 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009691#ifdef Py_DEBUG
9692 use_memcpy = 0;
9693#else
9694 use_memcpy = 1;
9695#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009696 for (i = 0; i < seqlen; i++) {
9697 const Py_ssize_t old_sz = sz;
9698 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009699 if (!PyUnicode_Check(item)) {
9700 PyErr_Format(PyExc_TypeError,
9701 "sequence item %zd: expected str instance,"
9702 " %.80s found",
9703 i, Py_TYPE(item)->tp_name);
9704 goto onError;
9705 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 if (PyUnicode_READY(item) == -1)
9707 goto onError;
9708 sz += PyUnicode_GET_LENGTH(item);
9709 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009710 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009711 if (i != 0)
9712 sz += seplen;
9713 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9714 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009715 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009716 goto onError;
9717 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009718 if (use_memcpy && last_obj != NULL) {
9719 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9720 use_memcpy = 0;
9721 }
9722 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009723 }
Tim Petersced69f82003-09-16 20:30:58 +00009724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009725 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009726 if (res == NULL)
9727 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009728
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009729 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009730#ifdef Py_DEBUG
9731 use_memcpy = 0;
9732#else
9733 if (use_memcpy) {
9734 res_data = PyUnicode_1BYTE_DATA(res);
9735 kind = PyUnicode_KIND(res);
9736 if (seplen != 0)
9737 sep_data = PyUnicode_1BYTE_DATA(sep);
9738 }
9739#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009741 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009742 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009743 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009744 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009745 if (use_memcpy) {
9746 Py_MEMCPY(res_data,
9747 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009748 kind * seplen);
9749 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009750 }
9751 else {
9752 copy_characters(res, res_offset, sep, 0, seplen);
9753 res_offset += seplen;
9754 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009755 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009756 itemlen = PyUnicode_GET_LENGTH(item);
9757 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009758 if (use_memcpy) {
9759 Py_MEMCPY(res_data,
9760 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009761 kind * itemlen);
9762 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009763 }
9764 else {
9765 copy_characters(res, res_offset, item, 0, itemlen);
9766 res_offset += itemlen;
9767 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009768 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009769 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009770 if (use_memcpy)
9771 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009772 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009773 else
9774 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009775
Tim Peters05eba1f2004-08-27 21:32:02 +00009776 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009778 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009780
Benjamin Peterson29060642009-01-31 22:14:21 +00009781 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009782 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009784 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009785 return NULL;
9786}
9787
/* FILL(kind, data, value, start, length): store value into length
   consecutive characters of the buffer data, beginning at character
   index start.  kind selects the element width: PyUnicode_1BYTE_KIND
   uses memset() on bytes, PyUnicode_2BYTE_KIND writes Py_UCS2 elements,
   and any other kind is written as Py_UCS4.  kind must not be
   PyUnicode_WCHAR_KIND.  Every argument is parenthesized at each use so
   that expressions can be passed safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)

9810
Victor Stinner9310abb2011-10-05 00:59:23 +02009811static PyObject *
9812pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009813 Py_ssize_t left,
9814 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009816{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 PyObject *u;
9818 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009819 int kind;
9820 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009821
9822 if (left < 0)
9823 left = 0;
9824 if (right < 0)
9825 right = 0;
9826
Tim Peters7a29bd52001-09-12 03:03:31 +00009827 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009828 Py_INCREF(self);
9829 return self;
9830 }
9831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9833 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009834 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9835 return NULL;
9836 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9838 if (fill > maxchar)
9839 maxchar = fill;
9840 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009841 if (!u)
9842 return NULL;
9843
9844 kind = PyUnicode_KIND(u);
9845 data = PyUnicode_DATA(u);
9846 if (left)
9847 FILL(kind, data, fill, 0, left);
9848 if (right)
9849 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009850 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009851 assert(_PyUnicode_CheckConsistency(u, 1));
9852 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009853}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009855
Alexander Belopolsky40018472011-02-26 01:02:56 +00009856PyObject *
9857PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009858{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860
9861 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009863 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 switch(PyUnicode_KIND(string)) {
9866 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009867 if (PyUnicode_IS_ASCII(string))
9868 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009869 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009870 PyUnicode_GET_LENGTH(string), keepends);
9871 else
9872 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009873 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009874 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 break;
9876 case PyUnicode_2BYTE_KIND:
9877 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009878 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009879 PyUnicode_GET_LENGTH(string), keepends);
9880 break;
9881 case PyUnicode_4BYTE_KIND:
9882 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009883 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009884 PyUnicode_GET_LENGTH(string), keepends);
9885 break;
9886 default:
9887 assert(0);
9888 list = 0;
9889 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009890 Py_DECREF(string);
9891 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009892}
9893
Alexander Belopolsky40018472011-02-26 01:02:56 +00009894static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009895split(PyObject *self,
9896 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009897 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009898{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009899 int kind1, kind2, kind;
9900 void *buf1, *buf2;
9901 Py_ssize_t len1, len2;
9902 PyObject* out;
9903
Guido van Rossumd57fd912000-03-10 22:53:23 +00009904 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009905 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 if (PyUnicode_READY(self) == -1)
9908 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 if (substring == NULL)
9911 switch(PyUnicode_KIND(self)) {
9912 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009913 if (PyUnicode_IS_ASCII(self))
9914 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009915 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009916 PyUnicode_GET_LENGTH(self), maxcount
9917 );
9918 else
9919 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009920 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009921 PyUnicode_GET_LENGTH(self), maxcount
9922 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 case PyUnicode_2BYTE_KIND:
9924 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009925 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926 PyUnicode_GET_LENGTH(self), maxcount
9927 );
9928 case PyUnicode_4BYTE_KIND:
9929 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009930 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931 PyUnicode_GET_LENGTH(self), maxcount
9932 );
9933 default:
9934 assert(0);
9935 return NULL;
9936 }
9937
9938 if (PyUnicode_READY(substring) == -1)
9939 return NULL;
9940
9941 kind1 = PyUnicode_KIND(self);
9942 kind2 = PyUnicode_KIND(substring);
9943 kind = kind1 > kind2 ? kind1 : kind2;
9944 buf1 = PyUnicode_DATA(self);
9945 buf2 = PyUnicode_DATA(substring);
9946 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009947 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009948 if (!buf1)
9949 return NULL;
9950 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009951 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 if (!buf2) {
9953 if (kind1 != kind) PyMem_Free(buf1);
9954 return NULL;
9955 }
9956 len1 = PyUnicode_GET_LENGTH(self);
9957 len2 = PyUnicode_GET_LENGTH(substring);
9958
9959 switch(kind) {
9960 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009961 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9962 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009963 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009964 else
9965 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009966 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009967 break;
9968 case PyUnicode_2BYTE_KIND:
9969 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009970 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009971 break;
9972 case PyUnicode_4BYTE_KIND:
9973 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009974 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 break;
9976 default:
9977 out = NULL;
9978 }
9979 if (kind1 != kind)
9980 PyMem_Free(buf1);
9981 if (kind2 != kind)
9982 PyMem_Free(buf2);
9983 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009984}
9985
Alexander Belopolsky40018472011-02-26 01:02:56 +00009986static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009987rsplit(PyObject *self,
9988 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009989 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 int kind1, kind2, kind;
9992 void *buf1, *buf2;
9993 Py_ssize_t len1, len2;
9994 PyObject* out;
9995
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009996 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009997 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 if (PyUnicode_READY(self) == -1)
10000 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010002 if (substring == NULL)
10003 switch(PyUnicode_KIND(self)) {
10004 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010005 if (PyUnicode_IS_ASCII(self))
10006 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010007 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010008 PyUnicode_GET_LENGTH(self), maxcount
10009 );
10010 else
10011 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010012 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010013 PyUnicode_GET_LENGTH(self), maxcount
10014 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 case PyUnicode_2BYTE_KIND:
10016 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010017 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 PyUnicode_GET_LENGTH(self), maxcount
10019 );
10020 case PyUnicode_4BYTE_KIND:
10021 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010022 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 PyUnicode_GET_LENGTH(self), maxcount
10024 );
10025 default:
10026 assert(0);
10027 return NULL;
10028 }
10029
10030 if (PyUnicode_READY(substring) == -1)
10031 return NULL;
10032
10033 kind1 = PyUnicode_KIND(self);
10034 kind2 = PyUnicode_KIND(substring);
10035 kind = kind1 > kind2 ? kind1 : kind2;
10036 buf1 = PyUnicode_DATA(self);
10037 buf2 = PyUnicode_DATA(substring);
10038 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010039 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 if (!buf1)
10041 return NULL;
10042 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010043 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044 if (!buf2) {
10045 if (kind1 != kind) PyMem_Free(buf1);
10046 return NULL;
10047 }
10048 len1 = PyUnicode_GET_LENGTH(self);
10049 len2 = PyUnicode_GET_LENGTH(substring);
10050
10051 switch(kind) {
10052 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010053 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10054 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010055 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010056 else
10057 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010058 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 break;
10060 case PyUnicode_2BYTE_KIND:
10061 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010062 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 break;
10064 case PyUnicode_4BYTE_KIND:
10065 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010066 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 break;
10068 default:
10069 out = NULL;
10070 }
10071 if (kind1 != kind)
10072 PyMem_Free(buf1);
10073 if (kind2 != kind)
10074 PyMem_Free(buf2);
10075 return out;
10076}
10077
10078static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010079anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10080 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081{
10082 switch(kind) {
10083 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010084 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10085 return asciilib_find(buf1, len1, buf2, len2, offset);
10086 else
10087 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 case PyUnicode_2BYTE_KIND:
10089 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10090 case PyUnicode_4BYTE_KIND:
10091 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10092 }
10093 assert(0);
10094 return -1;
10095}
10096
10097static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010098anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10099 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100{
10101 switch(kind) {
10102 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010103 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10104 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10105 else
10106 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 case PyUnicode_2BYTE_KIND:
10108 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10109 case PyUnicode_4BYTE_KIND:
10110 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10111 }
10112 assert(0);
10113 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010114}
10115
Alexander Belopolsky40018472011-02-26 01:02:56 +000010116static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117replace(PyObject *self, PyObject *str1,
10118 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010119{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 PyObject *u;
10121 char *sbuf = PyUnicode_DATA(self);
10122 char *buf1 = PyUnicode_DATA(str1);
10123 char *buf2 = PyUnicode_DATA(str2);
10124 int srelease = 0, release1 = 0, release2 = 0;
10125 int skind = PyUnicode_KIND(self);
10126 int kind1 = PyUnicode_KIND(str1);
10127 int kind2 = PyUnicode_KIND(str2);
10128 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10129 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10130 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010131 int mayshrink;
10132 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010133
10134 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010135 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010137 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138
Victor Stinner59de0ee2011-10-07 10:01:28 +020010139 if (str1 == str2)
10140 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 if (skind < kind1)
10142 /* substring too wide to be present */
10143 goto nothing;
10144
Victor Stinner49a0a212011-10-12 23:46:10 +020010145 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10146 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10147 /* Replacing str1 with str2 may cause a maxchar reduction in the
10148 result string. */
10149 mayshrink = (maxchar_str2 < maxchar);
10150 maxchar = Py_MAX(maxchar, maxchar_str2);
10151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010153 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010154 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010156 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010158 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010159 Py_UCS4 u1, u2;
10160 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010162 if (findchar(sbuf, PyUnicode_KIND(self),
10163 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010164 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010167 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010169 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 rkind = PyUnicode_KIND(u);
10171 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10172 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010173 if (--maxcount < 0)
10174 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010176 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010177 }
10178 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 int rkind = skind;
10180 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 if (kind1 < rkind) {
10183 /* widen substring */
10184 buf1 = _PyUnicode_AsKind(str1, rkind);
10185 if (!buf1) goto error;
10186 release1 = 1;
10187 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010188 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010189 if (i < 0)
10190 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 if (rkind > kind2) {
10192 /* widen replacement */
10193 buf2 = _PyUnicode_AsKind(str2, rkind);
10194 if (!buf2) goto error;
10195 release2 = 1;
10196 }
10197 else if (rkind < kind2) {
10198 /* widen self and buf1 */
10199 rkind = kind2;
10200 if (release1) PyMem_Free(buf1);
10201 sbuf = _PyUnicode_AsKind(self, rkind);
10202 if (!sbuf) goto error;
10203 srelease = 1;
10204 buf1 = _PyUnicode_AsKind(str1, rkind);
10205 if (!buf1) goto error;
10206 release1 = 1;
10207 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010208 u = PyUnicode_New(slen, maxchar);
10209 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010211 assert(PyUnicode_KIND(u) == rkind);
10212 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010213
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010214 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010215 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010216 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010218 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010220
10221 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010222 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010223 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010224 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010225 if (i == -1)
10226 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010227 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010229 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010231 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010233 }
10234 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 Py_ssize_t n, i, j, ires;
10236 Py_ssize_t product, new_size;
10237 int rkind = skind;
10238 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010241 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 buf1 = _PyUnicode_AsKind(str1, rkind);
10243 if (!buf1) goto error;
10244 release1 = 1;
10245 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010246 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010247 if (n == 0)
10248 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010250 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 buf2 = _PyUnicode_AsKind(str2, rkind);
10252 if (!buf2) goto error;
10253 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010254 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010256 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 rkind = kind2;
10258 sbuf = _PyUnicode_AsKind(self, rkind);
10259 if (!sbuf) goto error;
10260 srelease = 1;
10261 if (release1) PyMem_Free(buf1);
10262 buf1 = _PyUnicode_AsKind(str1, rkind);
10263 if (!buf1) goto error;
10264 release1 = 1;
10265 }
10266 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10267 PyUnicode_GET_LENGTH(str1))); */
10268 product = n * (len2-len1);
10269 if ((product / (len2-len1)) != n) {
10270 PyErr_SetString(PyExc_OverflowError,
10271 "replace string is too long");
10272 goto error;
10273 }
10274 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010275 if (new_size == 0) {
10276 Py_INCREF(unicode_empty);
10277 u = unicode_empty;
10278 goto done;
10279 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10281 PyErr_SetString(PyExc_OverflowError,
10282 "replace string is too long");
10283 goto error;
10284 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010285 u = PyUnicode_New(new_size, maxchar);
10286 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010288 assert(PyUnicode_KIND(u) == rkind);
10289 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 ires = i = 0;
10291 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010292 while (n-- > 0) {
10293 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010294 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010295 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010296 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010297 if (j == -1)
10298 break;
10299 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010300 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010301 memcpy(res + rkind * ires,
10302 sbuf + rkind * i,
10303 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010305 }
10306 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010307 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010308 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010310 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010313 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010316 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010317 memcpy(res + rkind * ires,
10318 sbuf + rkind * i,
10319 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010320 }
10321 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010322 /* interleave */
10323 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010324 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010326 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010328 if (--n <= 0)
10329 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010330 memcpy(res + rkind * ires,
10331 sbuf + rkind * i,
10332 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 ires++;
10334 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010335 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010336 memcpy(res + rkind * ires,
10337 sbuf + rkind * i,
10338 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010339 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010340 }
10341
10342 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010343 unicode_adjust_maxchar(&u);
10344 if (u == NULL)
10345 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010346 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010347
10348 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 if (srelease)
10350 PyMem_FREE(sbuf);
10351 if (release1)
10352 PyMem_FREE(buf1);
10353 if (release2)
10354 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010355 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010357
Benjamin Peterson29060642009-01-31 22:14:21 +000010358 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010359 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 if (srelease)
10361 PyMem_FREE(sbuf);
10362 if (release1)
10363 PyMem_FREE(buf1);
10364 if (release2)
10365 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010366 if (PyUnicode_CheckExact(self)) {
10367 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010368 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010369 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010370 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 error:
10372 if (srelease && sbuf)
10373 PyMem_FREE(sbuf);
10374 if (release1 && buf1)
10375 PyMem_FREE(buf1);
10376 if (release2 && buf2)
10377 PyMem_FREE(buf2);
10378 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379}
10380
10381/* --- Unicode Object Methods --------------------------------------------- */
10382
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010383PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010384 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385\n\
10386Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010387characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388
10389static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010390unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392 return fixup(self, fixtitle);
10393}
10394
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010395PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010396 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397\n\
10398Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010399have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010400
10401static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010402unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404 return fixup(self, fixcapitalize);
10405}
10406
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
 *
 * Splits self with split(self, NULL, -1), replaces every element of the
 * resulting list by fixup(element, fixcapitalize), and joins the list
 * again with PyUnicode_Join(NULL, list).  Returns NULL if the split or
 * any fixup() call fails.
 */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL) {
        return NULL;
    }

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);

        result = fixup(old_word, fixcapitalize);
        if (result == NULL) {
            goto finish;
        }
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  finish:
    /* result is NULL here when a fixup() call failed */
    Py_DECREF(words);
    return result;
}
#endif
10444
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010445/* Argument converter. Coerces to a single unicode character */
10446
10447static int
10448convert_uc(PyObject *obj, void *addr)
10449{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010451 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010452
Benjamin Peterson14339b62009-01-31 16:36:08 +000010453 uniobj = PyUnicode_FromObject(obj);
10454 if (uniobj == NULL) {
10455 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010456 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010457 return 0;
10458 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010460 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010461 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010462 Py_DECREF(uniobj);
10463 return 0;
10464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010466 Py_DECREF(uniobj);
10467 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010468}
10469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010470PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010471 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010472\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010473Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010474done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010475
10476static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010477unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010478{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010479 Py_ssize_t marg, left;
10480 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 Py_UCS4 fillchar = ' ';
10482
Victor Stinnere9a29352011-10-01 02:14:59 +020010483 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485
Victor Stinnere9a29352011-10-01 02:14:59 +020010486 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487 return NULL;
10488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010491 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492 }
10493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495 left = marg / 2 + (marg & width & 1);
10496
Victor Stinner9310abb2011-10-05 00:59:23 +020010497 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010498}
10499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500/* This function assumes that str1 and str2 are readied by the caller. */
10501
Marc-André Lemburge5034372000-08-08 08:04:29 +000010502static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010503unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010504{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 int kind1, kind2;
10506 void *data1, *data2;
10507 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 kind1 = PyUnicode_KIND(str1);
10510 kind2 = PyUnicode_KIND(str2);
10511 data1 = PyUnicode_DATA(str1);
10512 data2 = PyUnicode_DATA(str2);
10513 len1 = PyUnicode_GET_LENGTH(str1);
10514 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 for (i = 0; i < len1 && i < len2; ++i) {
10517 Py_UCS4 c1, c2;
10518 c1 = PyUnicode_READ(kind1, data1, i);
10519 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010520
10521 if (c1 != c2)
10522 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010523 }
10524
10525 return (len1 < len2) ? -1 : (len1 != len2);
10526}
10527
Alexander Belopolsky40018472011-02-26 01:02:56 +000010528int
10529PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10532 if (PyUnicode_READY(left) == -1 ||
10533 PyUnicode_READY(right) == -1)
10534 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010535 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010537 PyErr_Format(PyExc_TypeError,
10538 "Can't compare %.100s and %.100s",
10539 left->ob_type->tp_name,
10540 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541 return -1;
10542}
10543
Martin v. Löwis5b222132007-06-10 09:51:05 +000010544int
10545PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10546{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 Py_ssize_t i;
10548 int kind;
10549 void *data;
10550 Py_UCS4 chr;
10551
Victor Stinner910337b2011-10-03 03:20:16 +020010552 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 if (PyUnicode_READY(uni) == -1)
10554 return -1;
10555 kind = PyUnicode_KIND(uni);
10556 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010557 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10559 if (chr != str[i])
10560 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010561 /* This check keeps Python strings that end in '\0' from comparing equal
10562 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010564 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010565 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010566 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010567 return 0;
10568}
10569
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010570
/* Map a C truth value onto Py_True / Py_False; the reference count is not
   touched, so the user of the macro must Py_INCREF the result if needed. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010573
Alexander Belopolsky40018472011-02-26 01:02:56 +000010574PyObject *
10575PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010576{
10577 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010578
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010579 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10580 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 if (PyUnicode_READY(left) == -1 ||
10582 PyUnicode_READY(right) == -1)
10583 return NULL;
10584 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10585 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010586 if (op == Py_EQ) {
10587 Py_INCREF(Py_False);
10588 return Py_False;
10589 }
10590 if (op == Py_NE) {
10591 Py_INCREF(Py_True);
10592 return Py_True;
10593 }
10594 }
10595 if (left == right)
10596 result = 0;
10597 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010598 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010599
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010600 /* Convert the return value to a Boolean */
10601 switch (op) {
10602 case Py_EQ:
10603 v = TEST_COND(result == 0);
10604 break;
10605 case Py_NE:
10606 v = TEST_COND(result != 0);
10607 break;
10608 case Py_LE:
10609 v = TEST_COND(result <= 0);
10610 break;
10611 case Py_GE:
10612 v = TEST_COND(result >= 0);
10613 break;
10614 case Py_LT:
10615 v = TEST_COND(result == -1);
10616 break;
10617 case Py_GT:
10618 v = TEST_COND(result == 1);
10619 break;
10620 default:
10621 PyErr_BadArgument();
10622 return NULL;
10623 }
10624 Py_INCREF(v);
10625 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010626 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010627
Brian Curtindfc80e32011-08-10 20:28:54 -050010628 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010629}
10630
Alexander Belopolsky40018472011-02-26 01:02:56 +000010631int
10632PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010633{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010634 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 int kind1, kind2, kind;
10636 void *buf1, *buf2;
10637 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010638 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010639
10640 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010641 sub = PyUnicode_FromObject(element);
10642 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010643 PyErr_Format(PyExc_TypeError,
10644 "'in <string>' requires string as left operand, not %s",
10645 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010646 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010647 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 if (PyUnicode_READY(sub) == -1)
10649 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010650
Thomas Wouters477c8d52006-05-27 19:21:47 +000010651 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010652 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010653 Py_DECREF(sub);
10654 return -1;
10655 }
10656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 kind1 = PyUnicode_KIND(str);
10658 kind2 = PyUnicode_KIND(sub);
10659 kind = kind1 > kind2 ? kind1 : kind2;
10660 buf1 = PyUnicode_DATA(str);
10661 buf2 = PyUnicode_DATA(sub);
10662 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010663 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 if (!buf1) {
10665 Py_DECREF(sub);
10666 return -1;
10667 }
10668 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010669 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 if (!buf2) {
10671 Py_DECREF(sub);
10672 if (kind1 != kind) PyMem_Free(buf1);
10673 return -1;
10674 }
10675 len1 = PyUnicode_GET_LENGTH(str);
10676 len2 = PyUnicode_GET_LENGTH(sub);
10677
10678 switch(kind) {
10679 case PyUnicode_1BYTE_KIND:
10680 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10681 break;
10682 case PyUnicode_2BYTE_KIND:
10683 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10684 break;
10685 case PyUnicode_4BYTE_KIND:
10686 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10687 break;
10688 default:
10689 result = -1;
10690 assert(0);
10691 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010692
10693 Py_DECREF(str);
10694 Py_DECREF(sub);
10695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 if (kind1 != kind)
10697 PyMem_Free(buf1);
10698 if (kind2 != kind)
10699 PyMem_Free(buf2);
10700
Guido van Rossum403d68b2000-03-13 15:55:09 +000010701 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010702}
10703
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704/* Concat to string or Unicode object giving a new Unicode object. */
10705
Alexander Belopolsky40018472011-02-26 01:02:56 +000010706PyObject *
10707PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010710 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711
10712 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010715 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010718 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719
10720 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010721 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010722 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010725 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010726 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728 }
10729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010731 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10732 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 w = PyUnicode_New(
10736 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10737 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010739 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010740 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10741 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742 Py_DECREF(u);
10743 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010744 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010746
Benjamin Peterson29060642009-01-31 22:14:21 +000010747 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748 Py_XDECREF(u);
10749 Py_XDECREF(v);
10750 return NULL;
10751}
10752
Victor Stinnerb0923652011-10-04 01:17:31 +020010753static void
10754unicode_append_inplace(PyObject **p_left, PyObject *right)
10755{
10756 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010757
10758 assert(PyUnicode_IS_READY(*p_left));
10759 assert(PyUnicode_IS_READY(right));
10760
10761 left_len = PyUnicode_GET_LENGTH(*p_left);
10762 right_len = PyUnicode_GET_LENGTH(right);
10763 if (left_len > PY_SSIZE_T_MAX - right_len) {
10764 PyErr_SetString(PyExc_OverflowError,
10765 "strings are too large to concat");
10766 goto error;
10767 }
10768 new_len = left_len + right_len;
10769
10770 /* Now we own the last reference to 'left', so we can resize it
10771 * in-place.
10772 */
10773 if (unicode_resize(p_left, new_len) != 0) {
10774 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10775 * deallocated so it cannot be put back into
10776 * 'variable'. The MemoryError is raised when there
10777 * is no value in 'variable', which might (very
10778 * remotely) be a cause of incompatibilities.
10779 */
10780 goto error;
10781 }
10782 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010783 copy_characters(*p_left, left_len, right, 0, right_len);
10784 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010785 return;
10786
10787error:
10788 Py_DECREF(*p_left);
10789 *p_left = NULL;
10790}
10791
Walter Dörwald1ab83302007-05-18 17:15:44 +000010792void
Victor Stinner23e56682011-10-03 03:54:37 +020010793PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010794{
Victor Stinner23e56682011-10-03 03:54:37 +020010795 PyObject *left, *res;
10796
10797 if (p_left == NULL) {
10798 if (!PyErr_Occurred())
10799 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010800 return;
10801 }
Victor Stinner23e56682011-10-03 03:54:37 +020010802 left = *p_left;
10803 if (right == NULL || !PyUnicode_Check(left)) {
10804 if (!PyErr_Occurred())
10805 PyErr_BadInternalCall();
10806 goto error;
10807 }
10808
Victor Stinnere1335c72011-10-04 20:53:03 +020010809 if (PyUnicode_READY(left))
10810 goto error;
10811 if (PyUnicode_READY(right))
10812 goto error;
10813
Victor Stinner23e56682011-10-03 03:54:37 +020010814 if (PyUnicode_CheckExact(left) && left != unicode_empty
10815 && PyUnicode_CheckExact(right) && right != unicode_empty
10816 && unicode_resizable(left)
10817 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10818 || _PyUnicode_WSTR(left) != NULL))
10819 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010820 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10821 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010822 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010823 not so different than duplicating the string. */
10824 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010825 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010826 unicode_append_inplace(p_left, right);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010827 assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010828 return;
10829 }
10830 }
10831
10832 res = PyUnicode_Concat(left, right);
10833 if (res == NULL)
10834 goto error;
10835 Py_DECREF(left);
10836 *p_left = res;
10837 return;
10838
10839error:
10840 Py_DECREF(*p_left);
10841 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010842}
10843
10844void
10845PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10846{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010847 PyUnicode_Append(pleft, right);
10848 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010849}
10850
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010851PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010852 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010854Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010855string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010856interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857
10858static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010859unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010861 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010862 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010863 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865 int kind1, kind2, kind;
10866 void *buf1, *buf2;
10867 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868
Jesus Ceaac451502011-04-20 17:09:23 +020010869 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10870 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010871 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 kind1 = PyUnicode_KIND(self);
10874 kind2 = PyUnicode_KIND(substring);
10875 kind = kind1 > kind2 ? kind1 : kind2;
10876 buf1 = PyUnicode_DATA(self);
10877 buf2 = PyUnicode_DATA(substring);
10878 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010879 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 if (!buf1) {
10881 Py_DECREF(substring);
10882 return NULL;
10883 }
10884 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010885 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010886 if (!buf2) {
10887 Py_DECREF(substring);
10888 if (kind1 != kind) PyMem_Free(buf1);
10889 return NULL;
10890 }
10891 len1 = PyUnicode_GET_LENGTH(self);
10892 len2 = PyUnicode_GET_LENGTH(substring);
10893
10894 ADJUST_INDICES(start, end, len1);
10895 switch(kind) {
10896 case PyUnicode_1BYTE_KIND:
10897 iresult = ucs1lib_count(
10898 ((Py_UCS1*)buf1) + start, end - start,
10899 buf2, len2, PY_SSIZE_T_MAX
10900 );
10901 break;
10902 case PyUnicode_2BYTE_KIND:
10903 iresult = ucs2lib_count(
10904 ((Py_UCS2*)buf1) + start, end - start,
10905 buf2, len2, PY_SSIZE_T_MAX
10906 );
10907 break;
10908 case PyUnicode_4BYTE_KIND:
10909 iresult = ucs4lib_count(
10910 ((Py_UCS4*)buf1) + start, end - start,
10911 buf2, len2, PY_SSIZE_T_MAX
10912 );
10913 break;
10914 default:
10915 assert(0); iresult = 0;
10916 }
10917
10918 result = PyLong_FromSsize_t(iresult);
10919
10920 if (kind1 != kind)
10921 PyMem_Free(buf1);
10922 if (kind2 != kind)
10923 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924
10925 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010926
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927 return result;
10928}
10929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010930PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010931 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010933Encode S using the codec registered for encoding. Default encoding\n\
10934is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010935handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010936a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10937'xmlcharrefreplace' as well as any other name registered with\n\
10938codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939
10940static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010941unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010943 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944 char *encoding = NULL;
10945 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010946
Benjamin Peterson308d6372009-09-18 21:42:35 +000010947 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10948 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010950 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010951}
10952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010953PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010954 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955\n\
10956Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010957If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958
10959static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010960unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010962 Py_ssize_t i, j, line_pos, src_len, incr;
10963 Py_UCS4 ch;
10964 PyObject *u;
10965 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010967 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010968 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969
10970 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010971 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972
Antoine Pitrou22425222011-10-04 19:10:51 +020010973 if (PyUnicode_READY(self) == -1)
10974 return NULL;
10975
Thomas Wouters7e474022000-07-16 12:04:32 +000010976 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010977 src_len = PyUnicode_GET_LENGTH(self);
10978 i = j = line_pos = 0;
10979 kind = PyUnicode_KIND(self);
10980 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010981 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010982 for (; i < src_len; i++) {
10983 ch = PyUnicode_READ(kind, src_data, i);
10984 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010985 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010986 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010987 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010988 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010989 goto overflow;
10990 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010991 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010992 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010995 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010996 goto overflow;
10997 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010999 if (ch == '\n' || ch == '\r')
11000 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011002 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020011003 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010011004 Py_INCREF(self);
11005 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011006 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011007
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011009 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010 if (!u)
11011 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011012 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013
Antoine Pitroue71d5742011-10-04 15:55:09 +020011014 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015
Antoine Pitroue71d5742011-10-04 15:55:09 +020011016 for (; i < src_len; i++) {
11017 ch = PyUnicode_READ(kind, src_data, i);
11018 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011019 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011020 incr = tabsize - (line_pos % tabsize);
11021 line_pos += incr;
11022 while (incr--) {
11023 PyUnicode_WRITE(kind, dest_data, j, ' ');
11024 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011025 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011026 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011027 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011028 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011029 line_pos++;
11030 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011031 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011032 if (ch == '\n' || ch == '\r')
11033 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011035 }
11036 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011037 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011038
Antoine Pitroue71d5742011-10-04 15:55:09 +020011039 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011040 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11041 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042}
11043
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011044PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011045 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046\n\
11047Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011048such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049arguments start and end are interpreted as in slice notation.\n\
11050\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011051Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011052
11053static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011054unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011056 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011057 Py_ssize_t start;
11058 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011059 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060
Jesus Ceaac451502011-04-20 17:09:23 +020011061 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11062 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065 if (PyUnicode_READY(self) == -1)
11066 return NULL;
11067 if (PyUnicode_READY(substring) == -1)
11068 return NULL;
11069
Victor Stinner7931d9a2011-11-04 00:22:48 +010011070 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071
11072 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011074 if (result == -2)
11075 return NULL;
11076
Christian Heimes217cfd12007-12-02 14:31:20 +000011077 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078}
11079
11080static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011081unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011083 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11084 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011085 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087}
11088
Guido van Rossumc2504932007-09-18 19:42:40 +000011089/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011090 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011091static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011092unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093{
Guido van Rossumc2504932007-09-18 19:42:40 +000011094 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011095 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011097 if (_PyUnicode_HASH(self) != -1)
11098 return _PyUnicode_HASH(self);
11099 if (PyUnicode_READY(self) == -1)
11100 return -1;
11101 len = PyUnicode_GET_LENGTH(self);
11102
11103 /* The hash function as a macro, gets expanded three times below. */
11104#define HASH(P) \
11105 x = (Py_uhash_t)*P << 7; \
11106 while (--len >= 0) \
11107 x = (1000003*x) ^ (Py_uhash_t)*P++;
11108
11109 switch (PyUnicode_KIND(self)) {
11110 case PyUnicode_1BYTE_KIND: {
11111 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11112 HASH(c);
11113 break;
11114 }
11115 case PyUnicode_2BYTE_KIND: {
11116 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11117 HASH(s);
11118 break;
11119 }
11120 default: {
11121 Py_UCS4 *l;
11122 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11123 "Impossible switch case in unicode_hash");
11124 l = PyUnicode_4BYTE_DATA(self);
11125 HASH(l);
11126 break;
11127 }
11128 }
11129 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11130
Guido van Rossumc2504932007-09-18 19:42:40 +000011131 if (x == -1)
11132 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011133 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011134 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011138PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011139 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011141Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142
11143static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011146 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011147 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011148 Py_ssize_t start;
11149 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150
Jesus Ceaac451502011-04-20 17:09:23 +020011151 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11152 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155 if (PyUnicode_READY(self) == -1)
11156 return NULL;
11157 if (PyUnicode_READY(substring) == -1)
11158 return NULL;
11159
Victor Stinner7931d9a2011-11-04 00:22:48 +010011160 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161
11162 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164 if (result == -2)
11165 return NULL;
11166
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167 if (result < 0) {
11168 PyErr_SetString(PyExc_ValueError, "substring not found");
11169 return NULL;
11170 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011171
Christian Heimes217cfd12007-12-02 14:31:20 +000011172 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173}
11174
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011175PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011176 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011178Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011179at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180
11181static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011182unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 Py_ssize_t i, length;
11185 int kind;
11186 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187 int cased;
11188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011189 if (PyUnicode_READY(self) == -1)
11190 return NULL;
11191 length = PyUnicode_GET_LENGTH(self);
11192 kind = PyUnicode_KIND(self);
11193 data = PyUnicode_DATA(self);
11194
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 if (length == 1)
11197 return PyBool_FromLong(
11198 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011200 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011202 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011203
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 for (i = 0; i < length; i++) {
11206 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011207
Benjamin Peterson29060642009-01-31 22:14:21 +000011208 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11209 return PyBool_FromLong(0);
11210 else if (!cased && Py_UNICODE_ISLOWER(ch))
11211 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011213 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214}
11215
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011216PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011217 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011219Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011220at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221
11222static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011223unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225 Py_ssize_t i, length;
11226 int kind;
11227 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228 int cased;
11229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 if (PyUnicode_READY(self) == -1)
11231 return NULL;
11232 length = PyUnicode_GET_LENGTH(self);
11233 kind = PyUnicode_KIND(self);
11234 data = PyUnicode_DATA(self);
11235
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 if (length == 1)
11238 return PyBool_FromLong(
11239 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011241 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011242 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011243 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011244
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011246 for (i = 0; i < length; i++) {
11247 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011248
Benjamin Peterson29060642009-01-31 22:14:21 +000011249 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11250 return PyBool_FromLong(0);
11251 else if (!cased && Py_UNICODE_ISUPPER(ch))
11252 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011254 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255}
11256
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011257PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011258 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011260Return True if S is a titlecased string and there is at least one\n\
11261character in S, i.e. upper- and titlecase characters may only\n\
11262follow uncased characters and lowercase characters only cased ones.\n\
11263Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264
11265static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011266unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011268 Py_ssize_t i, length;
11269 int kind;
11270 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271 int cased, previous_is_cased;
11272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011273 if (PyUnicode_READY(self) == -1)
11274 return NULL;
11275 length = PyUnicode_GET_LENGTH(self);
11276 kind = PyUnicode_KIND(self);
11277 data = PyUnicode_DATA(self);
11278
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011280 if (length == 1) {
11281 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11282 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11283 (Py_UNICODE_ISUPPER(ch) != 0));
11284 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011286 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011288 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011289
Guido van Rossumd57fd912000-03-10 22:53:23 +000011290 cased = 0;
11291 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011292 for (i = 0; i < length; i++) {
11293 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011294
Benjamin Peterson29060642009-01-31 22:14:21 +000011295 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11296 if (previous_is_cased)
11297 return PyBool_FromLong(0);
11298 previous_is_cased = 1;
11299 cased = 1;
11300 }
11301 else if (Py_UNICODE_ISLOWER(ch)) {
11302 if (!previous_is_cased)
11303 return PyBool_FromLong(0);
11304 previous_is_cased = 1;
11305 cased = 1;
11306 }
11307 else
11308 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011310 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311}
11312
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011313PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011314 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011316Return True if all characters in S are whitespace\n\
11317and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318
11319static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011320unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322 Py_ssize_t i, length;
11323 int kind;
11324 void *data;
11325
11326 if (PyUnicode_READY(self) == -1)
11327 return NULL;
11328 length = PyUnicode_GET_LENGTH(self);
11329 kind = PyUnicode_KIND(self);
11330 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 if (length == 1)
11334 return PyBool_FromLong(
11335 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011337 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011339 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 for (i = 0; i < length; i++) {
11342 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011343 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011344 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011346 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347}
11348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011349PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011350 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011351\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011352Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011353and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011354
11355static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011356unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011357{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 Py_ssize_t i, length;
11359 int kind;
11360 void *data;
11361
11362 if (PyUnicode_READY(self) == -1)
11363 return NULL;
11364 length = PyUnicode_GET_LENGTH(self);
11365 kind = PyUnicode_KIND(self);
11366 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011367
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011368 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 if (length == 1)
11370 return PyBool_FromLong(
11371 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011372
11373 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011375 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 for (i = 0; i < length; i++) {
11378 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011379 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011380 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011381 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011382}
11383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011384PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011385 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011386\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011387Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011388and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011389
11390static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011391unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011392{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393 int kind;
11394 void *data;
11395 Py_ssize_t len, i;
11396
11397 if (PyUnicode_READY(self) == -1)
11398 return NULL;
11399
11400 kind = PyUnicode_KIND(self);
11401 data = PyUnicode_DATA(self);
11402 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011403
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011404 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011405 if (len == 1) {
11406 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11407 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11408 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011409
11410 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011412 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011414 for (i = 0; i < len; i++) {
11415 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011416 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011417 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011418 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011419 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011420}
11421
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011422PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011423 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011425Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011426False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427
11428static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011429unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431 Py_ssize_t i, length;
11432 int kind;
11433 void *data;
11434
11435 if (PyUnicode_READY(self) == -1)
11436 return NULL;
11437 length = PyUnicode_GET_LENGTH(self);
11438 kind = PyUnicode_KIND(self);
11439 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 if (length == 1)
11443 return PyBool_FromLong(
11444 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011446 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011448 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 for (i = 0; i < length; i++) {
11451 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011452 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011454 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455}
11456
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011457PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011458 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011460Return True if all characters in S are digits\n\
11461and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462
11463static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011464unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 Py_ssize_t i, length;
11467 int kind;
11468 void *data;
11469
11470 if (PyUnicode_READY(self) == -1)
11471 return NULL;
11472 length = PyUnicode_GET_LENGTH(self);
11473 kind = PyUnicode_KIND(self);
11474 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477 if (length == 1) {
11478 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11479 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11480 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011482 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011484 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 for (i = 0; i < length; i++) {
11487 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011488 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011490 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491}
11492
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011493PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011494 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011496Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011497False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498
11499static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011500unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 Py_ssize_t i, length;
11503 int kind;
11504 void *data;
11505
11506 if (PyUnicode_READY(self) == -1)
11507 return NULL;
11508 length = PyUnicode_GET_LENGTH(self);
11509 kind = PyUnicode_KIND(self);
11510 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 if (length == 1)
11514 return PyBool_FromLong(
11515 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011517 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011519 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 for (i = 0; i < length; i++) {
11522 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011523 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011525 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526}
11527
Martin v. Löwis47383402007-08-15 07:32:56 +000011528int
11529PyUnicode_IsIdentifier(PyObject *self)
11530{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531 int kind;
11532 void *data;
11533 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011534 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 if (PyUnicode_READY(self) == -1) {
11537 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011538 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539 }
11540
11541 /* Special case for empty strings */
11542 if (PyUnicode_GET_LENGTH(self) == 0)
11543 return 0;
11544 kind = PyUnicode_KIND(self);
11545 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011546
11547 /* PEP 3131 says that the first character must be in
11548 XID_Start and subsequent characters in XID_Continue,
11549 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011550 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011551 letters, digits, underscore). However, given the current
11552 definition of XID_Start and XID_Continue, it is sufficient
11553 to check just for these, except that _ must be allowed
11554 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011556 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011557 return 0;
11558
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011559 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011561 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011562 return 1;
11563}
11564
11565PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011566 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011567\n\
11568Return True if S is a valid identifier according\n\
11569to the language definition.");
11570
11571static PyObject*
11572unicode_isidentifier(PyObject *self)
11573{
11574 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11575}
11576
Georg Brandl559e5d72008-06-11 18:37:52 +000011577PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011578 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011579\n\
11580Return True if all characters in S are considered\n\
11581printable in repr() or S is empty, False otherwise.");
11582
11583static PyObject*
11584unicode_isprintable(PyObject *self)
11585{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586 Py_ssize_t i, length;
11587 int kind;
11588 void *data;
11589
11590 if (PyUnicode_READY(self) == -1)
11591 return NULL;
11592 length = PyUnicode_GET_LENGTH(self);
11593 kind = PyUnicode_KIND(self);
11594 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011595
11596 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011597 if (length == 1)
11598 return PyBool_FromLong(
11599 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 for (i = 0; i < length; i++) {
11602 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011603 Py_RETURN_FALSE;
11604 }
11605 }
11606 Py_RETURN_TRUE;
11607}
11608
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011609PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011610 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611\n\
11612Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011613iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614
11615static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011616unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011618 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619}
11620
Martin v. Löwis18e16552006-02-15 17:27:45 +000011621static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011622unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624 if (PyUnicode_READY(self) == -1)
11625 return -1;
11626 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627}
11628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011629PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011630 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011632Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011633done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634
11635static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011636unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011638 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 Py_UCS4 fillchar = ' ';
11640
11641 if (PyUnicode_READY(self) == -1)
11642 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011643
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011644 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645 return NULL;
11646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011647 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011649 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011650 }
11651
Victor Stinner7931d9a2011-11-04 00:22:48 +010011652 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653}
11654
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011655PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011656 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011658Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659
11660static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011661unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663 return fixup(self, fixlower);
11664}
11665
/* Strip modes; they double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by strip mode.  The table is
   fully read-only: neither the strings nor the pointers may change. */
static const char * const stripformat[] = {
    "|O:lstrip", "|O:rstrip", "|O:strip"
};

/* Method name for strip mode i: the format string with its 3-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11674
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011675/* externally visible for str.strip(unicode) */
11676PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011677_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011678{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 void *data;
11680 int kind;
11681 Py_ssize_t i, j, len;
11682 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11685 return NULL;
11686
11687 kind = PyUnicode_KIND(self);
11688 data = PyUnicode_DATA(self);
11689 len = PyUnicode_GET_LENGTH(self);
11690 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11691 PyUnicode_DATA(sepobj),
11692 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011693
Benjamin Peterson14339b62009-01-31 16:36:08 +000011694 i = 0;
11695 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 while (i < len &&
11697 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011698 i++;
11699 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011700 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011701
Benjamin Peterson14339b62009-01-31 16:36:08 +000011702 j = len;
11703 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011704 do {
11705 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706 } while (j >= i &&
11707 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011708 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011709 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011710
Victor Stinner7931d9a2011-11-04 00:22:48 +010011711 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712}
11713
11714PyObject*
11715PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11716{
11717 unsigned char *data;
11718 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011719 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720
Victor Stinnerde636f32011-10-01 03:55:54 +020011721 if (PyUnicode_READY(self) == -1)
11722 return NULL;
11723
11724 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11725
Victor Stinner12bab6d2011-10-01 01:53:49 +020011726 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011727 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011728 if (PyUnicode_CheckExact(self)) {
11729 Py_INCREF(self);
11730 return self;
11731 }
11732 else
11733 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 }
11735
Victor Stinner12bab6d2011-10-01 01:53:49 +020011736 length = end - start;
11737 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011738 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739
Victor Stinnerde636f32011-10-01 03:55:54 +020011740 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011741 PyErr_SetString(PyExc_IndexError, "string index out of range");
11742 return NULL;
11743 }
11744
Victor Stinnerb9275c12011-10-05 14:01:42 +020011745 if (PyUnicode_IS_ASCII(self)) {
11746 kind = PyUnicode_KIND(self);
11747 data = PyUnicode_1BYTE_DATA(self);
11748 return unicode_fromascii(data + start, length);
11749 }
11750 else {
11751 kind = PyUnicode_KIND(self);
11752 data = PyUnicode_1BYTE_DATA(self);
11753 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011754 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011755 length);
11756 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758
11759static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011760do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011762 int kind;
11763 void *data;
11764 Py_ssize_t len, i, j;
11765
11766 if (PyUnicode_READY(self) == -1)
11767 return NULL;
11768
11769 kind = PyUnicode_KIND(self);
11770 data = PyUnicode_DATA(self);
11771 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011772
Benjamin Peterson14339b62009-01-31 16:36:08 +000011773 i = 0;
11774 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011776 i++;
11777 }
11778 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011779
Benjamin Peterson14339b62009-01-31 16:36:08 +000011780 j = len;
11781 if (striptype != LEFTSTRIP) {
11782 do {
11783 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011784 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011785 j++;
11786 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011787
Victor Stinner7931d9a2011-11-04 00:22:48 +010011788 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789}
11790
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011791
11792static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011793do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011794{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011795 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011796
Benjamin Peterson14339b62009-01-31 16:36:08 +000011797 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11798 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011799
Benjamin Peterson14339b62009-01-31 16:36:08 +000011800 if (sep != NULL && sep != Py_None) {
11801 if (PyUnicode_Check(sep))
11802 return _PyUnicode_XStrip(self, striptype, sep);
11803 else {
11804 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011805 "%s arg must be None or str",
11806 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011807 return NULL;
11808 }
11809 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011810
Benjamin Peterson14339b62009-01-31 16:36:08 +000011811 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011812}
11813
11814
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011815PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011816 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011817\n\
11818Return a copy of the string S with leading and trailing\n\
11819whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011820If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011821
11822static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011823unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011824{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011825 if (PyTuple_GET_SIZE(args) == 0)
11826 return do_strip(self, BOTHSTRIP); /* Common case */
11827 else
11828 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011829}
11830
11831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011832PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011833 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011834\n\
11835Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011836If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011837
11838static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011839unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011840{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011841 if (PyTuple_GET_SIZE(args) == 0)
11842 return do_strip(self, LEFTSTRIP); /* Common case */
11843 else
11844 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011845}
11846
11847
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011848PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011849 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011850\n\
11851Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011852If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011853
11854static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011855unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011856{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011857 if (PyTuple_GET_SIZE(args) == 0)
11858 return do_strip(self, RIGHTSTRIP); /* Common case */
11859 else
11860 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011861}
11862
11863
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011865unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011867 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869
Georg Brandl222de0f2009-04-12 12:01:50 +000011870 if (len < 1) {
11871 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011872 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011873 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874
Tim Peters7a29bd52001-09-12 03:03:31 +000011875 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876 /* no repeat, return original string */
11877 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011878 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879 }
Tim Peters8f422462000-09-09 06:13:41 +000011880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 if (PyUnicode_READY(str) == -1)
11882 return NULL;
11883
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011884 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011885 PyErr_SetString(PyExc_OverflowError,
11886 "repeated string is too long");
11887 return NULL;
11888 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011889 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011890
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011891 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892 if (!u)
11893 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011894 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 if (PyUnicode_GET_LENGTH(str) == 1) {
11897 const int kind = PyUnicode_KIND(str);
11898 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11899 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011900 if (kind == PyUnicode_1BYTE_KIND)
11901 memset(to, (unsigned char)fill_char, len);
11902 else {
11903 for (n = 0; n < len; ++n)
11904 PyUnicode_WRITE(kind, to, n, fill_char);
11905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 }
11907 else {
11908 /* number of characters copied this far */
11909 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011910 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 char *to = (char *) PyUnicode_DATA(u);
11912 Py_MEMCPY(to, PyUnicode_DATA(str),
11913 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011914 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011915 n = (done <= nchars-done) ? done : nchars-done;
11916 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011917 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011918 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919 }
11920
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011921 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011922 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923}
11924
Alexander Belopolsky40018472011-02-26 01:02:56 +000011925PyObject *
11926PyUnicode_Replace(PyObject *obj,
11927 PyObject *subobj,
11928 PyObject *replobj,
11929 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930{
11931 PyObject *self;
11932 PyObject *str1;
11933 PyObject *str2;
11934 PyObject *result;
11935
11936 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011937 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011938 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011939 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011940 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 Py_DECREF(self);
11942 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943 }
11944 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011945 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011946 Py_DECREF(self);
11947 Py_DECREF(str1);
11948 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951 Py_DECREF(self);
11952 Py_DECREF(str1);
11953 Py_DECREF(str2);
11954 return result;
11955}
11956
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011957PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011958 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959\n\
11960Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011961old replaced by new. If the optional argument count is\n\
11962given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963
11964static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 PyObject *str1;
11968 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011969 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970 PyObject *result;
11971
Martin v. Löwis18e16552006-02-15 17:27:45 +000011972 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011975 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 str1 = PyUnicode_FromObject(str1);
11977 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11978 return NULL;
11979 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011980 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011981 Py_DECREF(str1);
11982 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984
11985 result = replace(self, str1, str2, maxcount);
11986
11987 Py_DECREF(str1);
11988 Py_DECREF(str2);
11989 return result;
11990}
11991
Alexander Belopolsky40018472011-02-26 01:02:56 +000011992static PyObject *
11993unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011995 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 Py_ssize_t isize;
11997 Py_ssize_t osize, squote, dquote, i, o;
11998 Py_UCS4 max, quote;
11999 int ikind, okind;
12000 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012003 return NULL;
12004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 isize = PyUnicode_GET_LENGTH(unicode);
12006 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 /* Compute length of output, quote characters, and
12009 maximum character */
12010 osize = 2; /* quotes */
12011 max = 127;
12012 squote = dquote = 0;
12013 ikind = PyUnicode_KIND(unicode);
12014 for (i = 0; i < isize; i++) {
12015 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12016 switch (ch) {
12017 case '\'': squote++; osize++; break;
12018 case '"': dquote++; osize++; break;
12019 case '\\': case '\t': case '\r': case '\n':
12020 osize += 2; break;
12021 default:
12022 /* Fast-path ASCII */
12023 if (ch < ' ' || ch == 0x7f)
12024 osize += 4; /* \xHH */
12025 else if (ch < 0x7f)
12026 osize++;
12027 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12028 osize++;
12029 max = ch > max ? ch : max;
12030 }
12031 else if (ch < 0x100)
12032 osize += 4; /* \xHH */
12033 else if (ch < 0x10000)
12034 osize += 6; /* \uHHHH */
12035 else
12036 osize += 10; /* \uHHHHHHHH */
12037 }
12038 }
12039
12040 quote = '\'';
12041 if (squote) {
12042 if (dquote)
12043 /* Both squote and dquote present. Use squote,
12044 and escape them */
12045 osize += squote;
12046 else
12047 quote = '"';
12048 }
12049
12050 repr = PyUnicode_New(osize, max);
12051 if (repr == NULL)
12052 return NULL;
12053 okind = PyUnicode_KIND(repr);
12054 odata = PyUnicode_DATA(repr);
12055
12056 PyUnicode_WRITE(okind, odata, 0, quote);
12057 PyUnicode_WRITE(okind, odata, osize-1, quote);
12058
12059 for (i = 0, o = 1; i < isize; i++) {
12060 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012061
12062 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 if ((ch == quote) || (ch == '\\')) {
12064 PyUnicode_WRITE(okind, odata, o++, '\\');
12065 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012066 continue;
12067 }
12068
Benjamin Peterson29060642009-01-31 22:14:21 +000012069 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012070 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 PyUnicode_WRITE(okind, odata, o++, '\\');
12072 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012073 }
12074 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012075 PyUnicode_WRITE(okind, odata, o++, '\\');
12076 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012077 }
12078 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 PyUnicode_WRITE(okind, odata, o++, '\\');
12080 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012081 }
12082
12083 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012084 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085 PyUnicode_WRITE(okind, odata, o++, '\\');
12086 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012087 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12088 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012089 }
12090
Georg Brandl559e5d72008-06-11 18:37:52 +000012091 /* Copy ASCII characters as-is */
12092 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012094 }
12095
Benjamin Peterson29060642009-01-31 22:14:21 +000012096 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012097 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012098 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012099 (categories Z* and C* except ASCII space)
12100 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012101 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012102 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 if (ch <= 0xff) {
12104 PyUnicode_WRITE(okind, odata, o++, '\\');
12105 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012106 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12107 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012108 }
12109 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 else if (ch >= 0x10000) {
12111 PyUnicode_WRITE(okind, odata, o++, '\\');
12112 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012113 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12114 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12115 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12116 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12117 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12118 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12119 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12120 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012121 }
12122 /* Map 16-bit characters to '\uxxxx' */
12123 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 PyUnicode_WRITE(okind, odata, o++, '\\');
12125 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012126 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12127 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12128 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12129 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012130 }
12131 }
12132 /* Copy characters as-is */
12133 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012135 }
12136 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012137 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012139 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012140 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141}
12142
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012143PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012144 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012145\n\
12146Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012147such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148arguments start and end are interpreted as in slice notation.\n\
12149\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012150Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151
12152static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012155 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012156 Py_ssize_t start;
12157 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012158 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159
Jesus Ceaac451502011-04-20 17:09:23 +020012160 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12161 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012162 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 if (PyUnicode_READY(self) == -1)
12165 return NULL;
12166 if (PyUnicode_READY(substring) == -1)
12167 return NULL;
12168
Victor Stinner7931d9a2011-11-04 00:22:48 +010012169 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170
12171 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 if (result == -2)
12174 return NULL;
12175
Christian Heimes217cfd12007-12-02 14:31:20 +000012176 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177}
12178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012179PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012180 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012182Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183
12184static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012187 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012188 Py_ssize_t start;
12189 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012190 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191
Jesus Ceaac451502011-04-20 17:09:23 +020012192 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12193 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012194 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196 if (PyUnicode_READY(self) == -1)
12197 return NULL;
12198 if (PyUnicode_READY(substring) == -1)
12199 return NULL;
12200
Victor Stinner7931d9a2011-11-04 00:22:48 +010012201 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012202
12203 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 if (result == -2)
12206 return NULL;
12207
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208 if (result < 0) {
12209 PyErr_SetString(PyExc_ValueError, "substring not found");
12210 return NULL;
12211 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212
Christian Heimes217cfd12007-12-02 14:31:20 +000012213 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214}
12215
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012216PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012217 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012219Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012220done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221
12222static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012223unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012225 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226 Py_UCS4 fillchar = ' ';
12227
Victor Stinnere9a29352011-10-01 02:14:59 +020012228 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012230
Victor Stinnere9a29352011-10-01 02:14:59 +020012231 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232 return NULL;
12233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012236 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237 }
12238
Victor Stinner7931d9a2011-11-04 00:22:48 +010012239 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240}
12241
Alexander Belopolsky40018472011-02-26 01:02:56 +000012242PyObject *
12243PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244{
12245 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012246
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247 s = PyUnicode_FromObject(s);
12248 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012249 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012250 if (sep != NULL) {
12251 sep = PyUnicode_FromObject(sep);
12252 if (sep == NULL) {
12253 Py_DECREF(s);
12254 return NULL;
12255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256 }
12257
Victor Stinner9310abb2011-10-05 00:59:23 +020012258 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259
12260 Py_DECREF(s);
12261 Py_XDECREF(sep);
12262 return result;
12263}
12264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012265PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012266 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267\n\
12268Return a list of the words in S, using sep as the\n\
12269delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012270splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012271whitespace string is a separator and empty strings are\n\
12272removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273
12274static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012275unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276{
12277 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012278 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279
Martin v. Löwis18e16552006-02-15 17:27:45 +000012280 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281 return NULL;
12282
12283 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012284 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012286 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012288 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289}
12290
Thomas Wouters477c8d52006-05-27 19:21:47 +000012291PyObject *
12292PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12293{
12294 PyObject* str_obj;
12295 PyObject* sep_obj;
12296 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 int kind1, kind2, kind;
12298 void *buf1 = NULL, *buf2 = NULL;
12299 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012300
12301 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012302 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012303 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012304 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012306 Py_DECREF(str_obj);
12307 return NULL;
12308 }
12309
Victor Stinner14f8f022011-10-05 20:58:25 +020012310 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012312 kind = Py_MAX(kind1, kind2);
12313 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012315 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 if (!buf1)
12317 goto onError;
12318 buf2 = PyUnicode_DATA(sep_obj);
12319 if (kind2 != kind)
12320 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12321 if (!buf2)
12322 goto onError;
12323 len1 = PyUnicode_GET_LENGTH(str_obj);
12324 len2 = PyUnicode_GET_LENGTH(sep_obj);
12325
Victor Stinner14f8f022011-10-05 20:58:25 +020012326 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012328 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12329 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12330 else
12331 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 break;
12333 case PyUnicode_2BYTE_KIND:
12334 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12335 break;
12336 case PyUnicode_4BYTE_KIND:
12337 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12338 break;
12339 default:
12340 assert(0);
12341 out = 0;
12342 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012343
12344 Py_DECREF(sep_obj);
12345 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 if (kind1 != kind)
12347 PyMem_Free(buf1);
12348 if (kind2 != kind)
12349 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012350
12351 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 onError:
12353 Py_DECREF(sep_obj);
12354 Py_DECREF(str_obj);
12355 if (kind1 != kind && buf1)
12356 PyMem_Free(buf1);
12357 if (kind2 != kind && buf2)
12358 PyMem_Free(buf2);
12359 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012360}
12361
12362
12363PyObject *
12364PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12365{
12366 PyObject* str_obj;
12367 PyObject* sep_obj;
12368 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 int kind1, kind2, kind;
12370 void *buf1 = NULL, *buf2 = NULL;
12371 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012372
12373 str_obj = PyUnicode_FromObject(str_in);
12374 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012376 sep_obj = PyUnicode_FromObject(sep_in);
12377 if (!sep_obj) {
12378 Py_DECREF(str_obj);
12379 return NULL;
12380 }
12381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 kind1 = PyUnicode_KIND(str_in);
12383 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012384 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012385 buf1 = PyUnicode_DATA(str_in);
12386 if (kind1 != kind)
12387 buf1 = _PyUnicode_AsKind(str_in, kind);
12388 if (!buf1)
12389 goto onError;
12390 buf2 = PyUnicode_DATA(sep_obj);
12391 if (kind2 != kind)
12392 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12393 if (!buf2)
12394 goto onError;
12395 len1 = PyUnicode_GET_LENGTH(str_obj);
12396 len2 = PyUnicode_GET_LENGTH(sep_obj);
12397
12398 switch(PyUnicode_KIND(str_in)) {
12399 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012400 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12401 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12402 else
12403 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 break;
12405 case PyUnicode_2BYTE_KIND:
12406 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12407 break;
12408 case PyUnicode_4BYTE_KIND:
12409 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12410 break;
12411 default:
12412 assert(0);
12413 out = 0;
12414 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012415
12416 Py_DECREF(sep_obj);
12417 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012418 if (kind1 != kind)
12419 PyMem_Free(buf1);
12420 if (kind2 != kind)
12421 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012422
12423 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424 onError:
12425 Py_DECREF(sep_obj);
12426 Py_DECREF(str_obj);
12427 if (kind1 != kind && buf1)
12428 PyMem_Free(buf1);
12429 if (kind2 != kind && buf2)
12430 PyMem_Free(buf2);
12431 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012432}
12433
12434PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012435 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012436\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012437Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012438the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012439found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012440
12441static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012442unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012443{
Victor Stinner9310abb2011-10-05 00:59:23 +020012444 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012445}
12446
12447PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012448 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012449\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012450Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012451the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012452separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012453
12454static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012455unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012456{
Victor Stinner9310abb2011-10-05 00:59:23 +020012457 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012458}
12459
Alexander Belopolsky40018472011-02-26 01:02:56 +000012460PyObject *
12461PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012462{
12463 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012464
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012465 s = PyUnicode_FromObject(s);
12466 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012467 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012468 if (sep != NULL) {
12469 sep = PyUnicode_FromObject(sep);
12470 if (sep == NULL) {
12471 Py_DECREF(s);
12472 return NULL;
12473 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012474 }
12475
Victor Stinner9310abb2011-10-05 00:59:23 +020012476 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012477
12478 Py_DECREF(s);
12479 Py_XDECREF(sep);
12480 return result;
12481}
12482
12483PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012484 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012485\n\
12486Return a list of the words in S, using sep as the\n\
12487delimiter string, starting at the end of the string and\n\
12488working to the front. If maxsplit is given, at most maxsplit\n\
12489splits are done. If sep is not specified, any whitespace string\n\
12490is a separator.");
12491
12492static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012493unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012494{
12495 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012496 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012497
Martin v. Löwis18e16552006-02-15 17:27:45 +000012498 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012499 return NULL;
12500
12501 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012502 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012503 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012504 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012505 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012506 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012507}
12508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012509PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012510 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511\n\
12512Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012513Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012514is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515
12516static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012517unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012519 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012520 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012522 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12523 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524 return NULL;
12525
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012526 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527}
12528
12529static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012530PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531{
Walter Dörwald346737f2007-05-31 10:44:43 +000012532 if (PyUnicode_CheckExact(self)) {
12533 Py_INCREF(self);
12534 return self;
12535 } else
12536 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012537 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538}
12539
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012540PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012541 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542\n\
12543Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012544and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545
12546static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012547unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549 return fixup(self, fixswapcase);
12550}
12551
Georg Brandlceee0772007-11-27 23:48:05 +000012552PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012553 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012554\n\
12555Return a translation table usable for str.translate().\n\
12556If there is only one argument, it must be a dictionary mapping Unicode\n\
12557ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012558Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012559If there are two arguments, they must be strings of equal length, and\n\
12560in the resulting dictionary, each character in x will be mapped to the\n\
12561character at the same position in y. If there is a third argument, it\n\
12562must be a string, whose characters will be mapped to None in the result.");
12563
12564static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012565unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012566{
12567 PyObject *x, *y = NULL, *z = NULL;
12568 PyObject *new = NULL, *key, *value;
12569 Py_ssize_t i = 0;
12570 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012571
Georg Brandlceee0772007-11-27 23:48:05 +000012572 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12573 return NULL;
12574 new = PyDict_New();
12575 if (!new)
12576 return NULL;
12577 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 int x_kind, y_kind, z_kind;
12579 void *x_data, *y_data, *z_data;
12580
Georg Brandlceee0772007-11-27 23:48:05 +000012581 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012582 if (!PyUnicode_Check(x)) {
12583 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12584 "be a string if there is a second argument");
12585 goto err;
12586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012588 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12589 "arguments must have equal length");
12590 goto err;
12591 }
12592 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012593 x_kind = PyUnicode_KIND(x);
12594 y_kind = PyUnicode_KIND(y);
12595 x_data = PyUnicode_DATA(x);
12596 y_data = PyUnicode_DATA(y);
12597 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12598 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12599 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012600 if (!key || !value)
12601 goto err;
12602 res = PyDict_SetItem(new, key, value);
12603 Py_DECREF(key);
12604 Py_DECREF(value);
12605 if (res < 0)
12606 goto err;
12607 }
12608 /* create entries for deleting chars in z */
12609 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 z_kind = PyUnicode_KIND(z);
12611 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012612 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012614 if (!key)
12615 goto err;
12616 res = PyDict_SetItem(new, key, Py_None);
12617 Py_DECREF(key);
12618 if (res < 0)
12619 goto err;
12620 }
12621 }
12622 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012623 int kind;
12624 void *data;
12625
Georg Brandlceee0772007-11-27 23:48:05 +000012626 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012627 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012628 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12629 "to maketrans it must be a dict");
12630 goto err;
12631 }
12632 /* copy entries into the new dict, converting string keys to int keys */
12633 while (PyDict_Next(x, &i, &key, &value)) {
12634 if (PyUnicode_Check(key)) {
12635 /* convert string keys to integer keys */
12636 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012637 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012638 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12639 "table must be of length 1");
12640 goto err;
12641 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 kind = PyUnicode_KIND(key);
12643 data = PyUnicode_DATA(key);
12644 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012645 if (!newkey)
12646 goto err;
12647 res = PyDict_SetItem(new, newkey, value);
12648 Py_DECREF(newkey);
12649 if (res < 0)
12650 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012651 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012652 /* just keep integer keys */
12653 if (PyDict_SetItem(new, key, value) < 0)
12654 goto err;
12655 } else {
12656 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12657 "be strings or integers");
12658 goto err;
12659 }
12660 }
12661 }
12662 return new;
12663 err:
12664 Py_DECREF(new);
12665 return NULL;
12666}
12667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012668PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012669 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670\n\
12671Return a copy of the string S, where all characters have been mapped\n\
12672through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012673Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012674Unmapped characters are left untouched. Characters mapped to None\n\
12675are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012676
12677static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012681}
12682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012683PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012684 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012686Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687
12688static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012689unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691 return fixup(self, fixupper);
12692}
12693
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012694PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012695 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012697Pad a numeric string S with zeros on the left, to fill a field\n\
12698of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699
12700static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012701unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012702{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012703 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012704 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012705 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012706 int kind;
12707 void *data;
12708 Py_UCS4 chr;
12709
12710 if (PyUnicode_READY(self) == -1)
12711 return NULL;
12712
Martin v. Löwis18e16552006-02-15 17:27:45 +000012713 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714 return NULL;
12715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012717 if (PyUnicode_CheckExact(self)) {
12718 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012719 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012720 }
12721 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012722 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012723 }
12724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012726
12727 u = pad(self, fill, 0, '0');
12728
Walter Dörwald068325e2002-04-15 13:36:47 +000012729 if (u == NULL)
12730 return NULL;
12731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 kind = PyUnicode_KIND(u);
12733 data = PyUnicode_DATA(u);
12734 chr = PyUnicode_READ(kind, data, fill);
12735
12736 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 PyUnicode_WRITE(kind, data, 0, chr);
12739 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740 }
12741
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012742 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012743 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012744}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012745
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() applied to self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012754PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012755 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012756\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012757Return True if S starts with the specified prefix, False otherwise.\n\
12758With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012759With optional end, stop comparing S at that position.\n\
12760prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012761
12762static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012763unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012764 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012765{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012766 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012767 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012768 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012769 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012770 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012771
Jesus Ceaac451502011-04-20 17:09:23 +020012772 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012773 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012774 if (PyTuple_Check(subobj)) {
12775 Py_ssize_t i;
12776 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012777 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012778 if (substring == NULL)
12779 return NULL;
12780 result = tailmatch(self, substring, start, end, -1);
12781 Py_DECREF(substring);
12782 if (result) {
12783 Py_RETURN_TRUE;
12784 }
12785 }
12786 /* nothing matched */
12787 Py_RETURN_FALSE;
12788 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012789 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012790 if (substring == NULL) {
12791 if (PyErr_ExceptionMatches(PyExc_TypeError))
12792 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12793 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012794 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012795 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012796 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012797 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012798 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799}
12800
12801
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012802PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012803 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012804\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012805Return True if S ends with the specified suffix, False otherwise.\n\
12806With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012807With optional end, stop comparing S at that position.\n\
12808suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012809
12810static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012811unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012812 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012814 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012815 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012816 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012817 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012818 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819
Jesus Ceaac451502011-04-20 17:09:23 +020012820 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012821 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012822 if (PyTuple_Check(subobj)) {
12823 Py_ssize_t i;
12824 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012825 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012826 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012827 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012828 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012829 result = tailmatch(self, substring, start, end, +1);
12830 Py_DECREF(substring);
12831 if (result) {
12832 Py_RETURN_TRUE;
12833 }
12834 }
12835 Py_RETURN_FALSE;
12836 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012837 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012838 if (substring == NULL) {
12839 if (PyErr_ExceptionMatches(PyExc_TypeError))
12840 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12841 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012842 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012843 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012844 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012845 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012846 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847}
12848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012849#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012850
12851PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012852 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012853\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012854Return a formatted version of S, using substitutions from args and kwargs.\n\
12855The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012856
Eric Smith27bbca62010-11-04 17:06:58 +000012857PyDoc_STRVAR(format_map__doc__,
12858 "S.format_map(mapping) -> str\n\
12859\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012860Return a formatted version of S, using substitutions from mapping.\n\
12861The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012862
Eric Smith4a7d76d2008-05-30 18:10:19 +000012863static PyObject *
12864unicode__format__(PyObject* self, PyObject* args)
12865{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012866 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012867
12868 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12869 return NULL;
12870
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012871 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012872 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012873 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012874}
12875
Eric Smith8c663262007-08-25 02:26:07 +000012876PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012877 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012878\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012879Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012880
12881static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012882unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012883{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012884 Py_ssize_t size;
12885
12886 /* If it's a compact object, account for base structure +
12887 character data. */
12888 if (PyUnicode_IS_COMPACT_ASCII(v))
12889 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12890 else if (PyUnicode_IS_COMPACT(v))
12891 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012892 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012893 else {
12894 /* If it is a two-block object, account for base object, and
12895 for character block if present. */
12896 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012897 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012898 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012899 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012900 }
12901 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012902 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012903 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012904 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012905 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012906 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012907
12908 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012909}
12910
12911PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012912 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012913
12914static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012915unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012916{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012917 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012918 if (!copy)
12919 return NULL;
12920 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012921}
12922
Guido van Rossumd57fd912000-03-10 22:53:23 +000012923static PyMethodDef unicode_methods[] = {
12924
12925 /* Order is according to common usage: often used methods should
12926 appear first, since lookup is done sequentially. */
12927
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012928 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012929 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12930 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012931 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012932 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12933 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12934 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12935 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12936 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12937 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12938 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012939 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012940 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12941 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12942 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012943 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012944 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12945 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12946 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012947 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012948 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012949 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012950 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012951 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12952 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12953 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12954 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12955 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12956 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12957 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12958 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12959 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12960 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12961 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12962 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12963 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12964 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012965 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012966 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012967 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012968 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012969 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012970 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012971 {"maketrans", (PyCFunction) unicode_maketrans,
12972 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012973 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012974#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012975 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012976#endif
12977
12978#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012979 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012980 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012981#endif
12982
Benjamin Peterson14339b62009-01-31 16:36:08 +000012983 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984 {NULL, NULL}
12985};
12986
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012987static PyObject *
12988unicode_mod(PyObject *v, PyObject *w)
12989{
Brian Curtindfc80e32011-08-10 20:28:54 -050012990 if (!PyUnicode_Check(v))
12991 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012992 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012993}
12994
12995static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012996 0, /*nb_add*/
12997 0, /*nb_subtract*/
12998 0, /*nb_multiply*/
12999 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013000};
13001
Guido van Rossumd57fd912000-03-10 22:53:23 +000013002static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013003 (lenfunc) unicode_length, /* sq_length */
13004 PyUnicode_Concat, /* sq_concat */
13005 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13006 (ssizeargfunc) unicode_getitem, /* sq_item */
13007 0, /* sq_slice */
13008 0, /* sq_ass_item */
13009 0, /* sq_ass_slice */
13010 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013011};
13012
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013013static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013014unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013015{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013016 if (PyUnicode_READY(self) == -1)
13017 return NULL;
13018
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013019 if (PyIndex_Check(item)) {
13020 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013021 if (i == -1 && PyErr_Occurred())
13022 return NULL;
13023 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013024 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013025 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013026 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013027 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013028 PyObject *result;
13029 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013030 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013031 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013033 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013034 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013035 return NULL;
13036 }
13037
13038 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013039 return PyUnicode_New(0, 0);
13040 } else if (start == 0 && step == 1 &&
13041 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013042 PyUnicode_CheckExact(self)) {
13043 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013044 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000013045 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013046 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013047 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013048 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013049 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013050 src_kind = PyUnicode_KIND(self);
13051 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013052 if (!PyUnicode_IS_ASCII(self)) {
13053 kind_limit = kind_maxchar_limit(src_kind);
13054 max_char = 0;
13055 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13056 ch = PyUnicode_READ(src_kind, src_data, cur);
13057 if (ch > max_char) {
13058 max_char = ch;
13059 if (max_char >= kind_limit)
13060 break;
13061 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013062 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013063 }
Victor Stinner55c99112011-10-13 01:17:06 +020013064 else
13065 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013066 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013067 if (result == NULL)
13068 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013069 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013070 dest_data = PyUnicode_DATA(result);
13071
13072 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013073 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13074 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013075 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013076 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013077 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013078 } else {
13079 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13080 return NULL;
13081 }
13082}
13083
13084static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013085 (lenfunc)unicode_length, /* mp_length */
13086 (binaryfunc)unicode_subscript, /* mp_subscript */
13087 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013088};
13089
Guido van Rossumd57fd912000-03-10 22:53:23 +000013090
Guido van Rossumd57fd912000-03-10 22:53:23 +000013091/* Helpers for PyUnicode_Format() */
13092
13093static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013094getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013096 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013098 (*p_argidx)++;
13099 if (arglen < 0)
13100 return args;
13101 else
13102 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103 }
13104 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013105 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106 return NULL;
13107}
13108
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013109/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013111static PyObject *
13112formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013114 char *p;
13115 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013117
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118 x = PyFloat_AsDouble(v);
13119 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013120 return NULL;
13121
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013123 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013124
Eric Smith0923d1d2009-04-16 20:16:10 +000013125 p = PyOS_double_to_string(x, type, prec,
13126 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013127 if (p == NULL)
13128 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013129 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013130 PyMem_Free(p);
13131 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132}
13133
Tim Peters38fd5b62000-09-21 05:43:11 +000013134static PyObject*
13135formatlong(PyObject *val, int flags, int prec, int type)
13136{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013137 char *buf;
13138 int len;
13139 PyObject *str; /* temporary string object. */
13140 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013141
Benjamin Peterson14339b62009-01-31 16:36:08 +000013142 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13143 if (!str)
13144 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013145 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013146 Py_DECREF(str);
13147 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013148}
13149
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013150static Py_UCS4
13151formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013152{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013153 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013154 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013155 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013156 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013157 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013158 goto onError;
13159 }
13160 else {
13161 /* Integer input truncated to a character */
13162 long x;
13163 x = PyLong_AsLong(v);
13164 if (x == -1 && PyErr_Occurred())
13165 goto onError;
13166
13167 if (x < 0 || x > 0x10ffff) {
13168 PyErr_SetString(PyExc_OverflowError,
13169 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013170 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013171 }
13172
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013173 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013174 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013175
Benjamin Peterson29060642009-01-31 22:14:21 +000013176 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013177 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013178 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013179 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013180}
13181
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013182static int
13183repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13184{
13185 int r;
13186 assert(count > 0);
13187 assert(PyUnicode_Check(obj));
13188 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013189 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013190 if (repeated == NULL)
13191 return -1;
13192 r = _PyAccu_Accumulate(acc, repeated);
13193 Py_DECREF(repeated);
13194 return r;
13195 }
13196 else {
13197 do {
13198 if (_PyAccu_Accumulate(acc, obj))
13199 return -1;
13200 } while (--count);
13201 return 0;
13202 }
13203}
13204
Alexander Belopolsky40018472011-02-26 01:02:56 +000013205PyObject *
13206PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013208 void *fmt;
13209 int fmtkind;
13210 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013211 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013212 int r;
13213 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013214 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013216 PyObject *temp = NULL;
13217 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013218 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013219 _PyAccu acc;
13220 static PyObject *plus, *minus, *blank, *zero, *percent;
13221
13222 if (!plus && !(plus = get_latin1_char('+')))
13223 return NULL;
13224 if (!minus && !(minus = get_latin1_char('-')))
13225 return NULL;
13226 if (!blank && !(blank = get_latin1_char(' ')))
13227 return NULL;
13228 if (!zero && !(zero = get_latin1_char('0')))
13229 return NULL;
13230 if (!percent && !(percent = get_latin1_char('%')))
13231 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013232
Guido van Rossumd57fd912000-03-10 22:53:23 +000013233 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013234 PyErr_BadInternalCall();
13235 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013236 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013237 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013238 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013239 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013240 if (_PyAccu_Init(&acc))
13241 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013242 fmt = PyUnicode_DATA(uformat);
13243 fmtkind = PyUnicode_KIND(uformat);
13244 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13245 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013246
Guido van Rossumd57fd912000-03-10 22:53:23 +000013247 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013248 arglen = PyTuple_Size(args);
13249 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013250 }
13251 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013252 arglen = -1;
13253 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013254 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013255 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013256 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013257 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013258
13259 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013260 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013261 PyObject *nonfmt;
13262 Py_ssize_t nonfmtpos;
13263 nonfmtpos = fmtpos++;
13264 while (fmtcnt >= 0 &&
13265 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13266 fmtpos++;
13267 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013268 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013269 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013270 if (nonfmt == NULL)
13271 goto onError;
13272 r = _PyAccu_Accumulate(&acc, nonfmt);
13273 Py_DECREF(nonfmt);
13274 if (r)
13275 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013276 }
13277 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013278 /* Got a format specifier */
13279 int flags = 0;
13280 Py_ssize_t width = -1;
13281 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013282 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013283 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013284 int isnumok;
13285 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013286 void *pbuf = NULL;
13287 Py_ssize_t pindex, len;
13288 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013290 fmtpos++;
13291 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13292 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013293 Py_ssize_t keylen;
13294 PyObject *key;
13295 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013296
Benjamin Peterson29060642009-01-31 22:14:21 +000013297 if (dict == NULL) {
13298 PyErr_SetString(PyExc_TypeError,
13299 "format requires a mapping");
13300 goto onError;
13301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013302 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013303 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013304 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013305 /* Skip over balanced parentheses */
13306 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013307 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013308 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013309 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013310 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013311 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013313 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013314 if (fmtcnt < 0 || pcount > 0) {
13315 PyErr_SetString(PyExc_ValueError,
13316 "incomplete format key");
13317 goto onError;
13318 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013319 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013320 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013321 if (key == NULL)
13322 goto onError;
13323 if (args_owned) {
13324 Py_DECREF(args);
13325 args_owned = 0;
13326 }
13327 args = PyObject_GetItem(dict, key);
13328 Py_DECREF(key);
13329 if (args == NULL) {
13330 goto onError;
13331 }
13332 args_owned = 1;
13333 arglen = -1;
13334 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013335 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013336 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013337 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013338 case '-': flags |= F_LJUST; continue;
13339 case '+': flags |= F_SIGN; continue;
13340 case ' ': flags |= F_BLANK; continue;
13341 case '#': flags |= F_ALT; continue;
13342 case '0': flags |= F_ZERO; continue;
13343 }
13344 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013345 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013346 if (c == '*') {
13347 v = getnextarg(args, arglen, &argidx);
13348 if (v == NULL)
13349 goto onError;
13350 if (!PyLong_Check(v)) {
13351 PyErr_SetString(PyExc_TypeError,
13352 "* wants int");
13353 goto onError;
13354 }
13355 width = PyLong_AsLong(v);
13356 if (width == -1 && PyErr_Occurred())
13357 goto onError;
13358 if (width < 0) {
13359 flags |= F_LJUST;
13360 width = -width;
13361 }
13362 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013363 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013364 }
13365 else if (c >= '0' && c <= '9') {
13366 width = c - '0';
13367 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013368 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013369 if (c < '0' || c > '9')
13370 break;
13371 if ((width*10) / 10 != width) {
13372 PyErr_SetString(PyExc_ValueError,
13373 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013374 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013375 }
13376 width = width*10 + (c - '0');
13377 }
13378 }
13379 if (c == '.') {
13380 prec = 0;
13381 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013382 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013383 if (c == '*') {
13384 v = getnextarg(args, arglen, &argidx);
13385 if (v == NULL)
13386 goto onError;
13387 if (!PyLong_Check(v)) {
13388 PyErr_SetString(PyExc_TypeError,
13389 "* wants int");
13390 goto onError;
13391 }
13392 prec = PyLong_AsLong(v);
13393 if (prec == -1 && PyErr_Occurred())
13394 goto onError;
13395 if (prec < 0)
13396 prec = 0;
13397 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013398 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 }
13400 else if (c >= '0' && c <= '9') {
13401 prec = c - '0';
13402 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013403 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013404 if (c < '0' || c > '9')
13405 break;
13406 if ((prec*10) / 10 != prec) {
13407 PyErr_SetString(PyExc_ValueError,
13408 "prec too big");
13409 goto onError;
13410 }
13411 prec = prec*10 + (c - '0');
13412 }
13413 }
13414 } /* prec */
13415 if (fmtcnt >= 0) {
13416 if (c == 'h' || c == 'l' || c == 'L') {
13417 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013418 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013419 }
13420 }
13421 if (fmtcnt < 0) {
13422 PyErr_SetString(PyExc_ValueError,
13423 "incomplete format");
13424 goto onError;
13425 }
13426 if (c != '%') {
13427 v = getnextarg(args, arglen, &argidx);
13428 if (v == NULL)
13429 goto onError;
13430 }
13431 sign = 0;
13432 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013433 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013434 switch (c) {
13435
13436 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013437 _PyAccu_Accumulate(&acc, percent);
13438 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013439
13440 case 's':
13441 case 'r':
13442 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013443 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013444 temp = v;
13445 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013446 }
13447 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013448 if (c == 's')
13449 temp = PyObject_Str(v);
13450 else if (c == 'r')
13451 temp = PyObject_Repr(v);
13452 else
13453 temp = PyObject_ASCII(v);
13454 if (temp == NULL)
13455 goto onError;
13456 if (PyUnicode_Check(temp))
13457 /* nothing to do */;
13458 else {
13459 Py_DECREF(temp);
13460 PyErr_SetString(PyExc_TypeError,
13461 "%s argument has non-string str()");
13462 goto onError;
13463 }
13464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013465 if (PyUnicode_READY(temp) == -1) {
13466 Py_CLEAR(temp);
13467 goto onError;
13468 }
13469 pbuf = PyUnicode_DATA(temp);
13470 kind = PyUnicode_KIND(temp);
13471 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013472 if (prec >= 0 && len > prec)
13473 len = prec;
13474 break;
13475
13476 case 'i':
13477 case 'd':
13478 case 'u':
13479 case 'o':
13480 case 'x':
13481 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013482 isnumok = 0;
13483 if (PyNumber_Check(v)) {
13484 PyObject *iobj=NULL;
13485
13486 if (PyLong_Check(v)) {
13487 iobj = v;
13488 Py_INCREF(iobj);
13489 }
13490 else {
13491 iobj = PyNumber_Long(v);
13492 }
13493 if (iobj!=NULL) {
13494 if (PyLong_Check(iobj)) {
13495 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013496 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013497 Py_DECREF(iobj);
13498 if (!temp)
13499 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013500 if (PyUnicode_READY(temp) == -1) {
13501 Py_CLEAR(temp);
13502 goto onError;
13503 }
13504 pbuf = PyUnicode_DATA(temp);
13505 kind = PyUnicode_KIND(temp);
13506 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013507 sign = 1;
13508 }
13509 else {
13510 Py_DECREF(iobj);
13511 }
13512 }
13513 }
13514 if (!isnumok) {
13515 PyErr_Format(PyExc_TypeError,
13516 "%%%c format: a number is required, "
13517 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13518 goto onError;
13519 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013520 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013521 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013522 fillobj = zero;
13523 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013524 break;
13525
13526 case 'e':
13527 case 'E':
13528 case 'f':
13529 case 'F':
13530 case 'g':
13531 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013532 temp = formatfloat(v, flags, prec, c);
13533 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013534 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013535 if (PyUnicode_READY(temp) == -1) {
13536 Py_CLEAR(temp);
13537 goto onError;
13538 }
13539 pbuf = PyUnicode_DATA(temp);
13540 kind = PyUnicode_KIND(temp);
13541 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013542 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013543 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013544 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013545 fillobj = zero;
13546 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013547 break;
13548
13549 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013550 {
13551 Py_UCS4 ch = formatchar(v);
13552 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013553 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013554 temp = _PyUnicode_FromUCS4(&ch, 1);
13555 if (temp == NULL)
13556 goto onError;
13557 pbuf = PyUnicode_DATA(temp);
13558 kind = PyUnicode_KIND(temp);
13559 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013560 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013561 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013562
13563 default:
13564 PyErr_Format(PyExc_ValueError,
13565 "unsupported format character '%c' (0x%x) "
13566 "at index %zd",
13567 (31<=c && c<=126) ? (char)c : '?',
13568 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013569 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013570 goto onError;
13571 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013572 /* pbuf is initialized here. */
13573 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013574 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013575 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13576 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013577 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013578 pindex++;
13579 }
13580 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13581 signobj = plus;
13582 len--;
13583 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013584 }
13585 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013586 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013587 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013588 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013589 else
13590 sign = 0;
13591 }
13592 if (width < len)
13593 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013594 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013595 if (fill != ' ') {
13596 assert(signobj != NULL);
13597 if (_PyAccu_Accumulate(&acc, signobj))
13598 goto onError;
13599 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013600 if (width > len)
13601 width--;
13602 }
13603 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013604 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013605 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013606 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013607 second = get_latin1_char(
13608 PyUnicode_READ(kind, pbuf, pindex + 1));
13609 pindex += 2;
13610 if (second == NULL ||
13611 _PyAccu_Accumulate(&acc, zero) ||
13612 _PyAccu_Accumulate(&acc, second))
13613 goto onError;
13614 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013615 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013616 width -= 2;
13617 if (width < 0)
13618 width = 0;
13619 len -= 2;
13620 }
13621 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013622 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013623 if (repeat_accumulate(&acc, fillobj, width - len))
13624 goto onError;
13625 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013626 }
13627 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013628 if (sign) {
13629 assert(signobj != NULL);
13630 if (_PyAccu_Accumulate(&acc, signobj))
13631 goto onError;
13632 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013633 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013634 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13635 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013636 second = get_latin1_char(
13637 PyUnicode_READ(kind, pbuf, pindex + 1));
13638 pindex += 2;
13639 if (second == NULL ||
13640 _PyAccu_Accumulate(&acc, zero) ||
13641 _PyAccu_Accumulate(&acc, second))
13642 goto onError;
13643 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013644 }
13645 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013646 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013647 if (temp != NULL) {
13648 assert(pbuf == PyUnicode_DATA(temp));
13649 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013650 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013651 else {
13652 const char *p = (const char *) pbuf;
13653 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013654 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013655 v = PyUnicode_FromKindAndData(kind, p, len);
13656 }
13657 if (v == NULL)
13658 goto onError;
13659 r = _PyAccu_Accumulate(&acc, v);
13660 Py_DECREF(v);
13661 if (r)
13662 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013663 if (width > len && repeat_accumulate(&acc, blank, width - len))
13664 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013665 if (dict && (argidx < arglen) && c != '%') {
13666 PyErr_SetString(PyExc_TypeError,
13667 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013668 goto onError;
13669 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013670 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013671 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013672 } /* until end */
13673 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013674 PyErr_SetString(PyExc_TypeError,
13675 "not all arguments converted during string formatting");
13676 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013677 }
13678
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013679 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013680 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013681 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013682 }
13683 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013684 Py_XDECREF(temp);
13685 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013686 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013687
Benjamin Peterson29060642009-01-31 22:14:21 +000013688 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013689 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013690 Py_XDECREF(temp);
13691 Py_XDECREF(second);
13692 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013693 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013694 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013695 }
13696 return NULL;
13697}
13698
Jeremy Hylton938ace62002-07-17 16:30:39 +000013699static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013700unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13701
Tim Peters6d6c1a32001-08-02 04:15:00 +000013702static PyObject *
13703unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13704{
Benjamin Peterson29060642009-01-31 22:14:21 +000013705 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013706 static char *kwlist[] = {"object", "encoding", "errors", 0};
13707 char *encoding = NULL;
13708 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013709
Benjamin Peterson14339b62009-01-31 16:36:08 +000013710 if (type != &PyUnicode_Type)
13711 return unicode_subtype_new(type, args, kwds);
13712 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013713 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013714 return NULL;
13715 if (x == NULL)
Victor Stinner7931d9a2011-11-04 00:22:48 +010013716 return PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013717 if (encoding == NULL && errors == NULL)
13718 return PyObject_Str(x);
13719 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013720 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013721}
13722
Guido van Rossume023fe02001-08-30 03:12:59 +000013723static PyObject *
13724unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13725{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013726 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013727 Py_ssize_t length, char_size;
13728 int share_wstr, share_utf8;
13729 unsigned int kind;
13730 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013731
Benjamin Peterson14339b62009-01-31 16:36:08 +000013732 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013733
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013734 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013735 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013736 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013737 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013738 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013739 return NULL;
13740
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013741 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013742 if (self == NULL) {
13743 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013744 return NULL;
13745 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013746 kind = PyUnicode_KIND(unicode);
13747 length = PyUnicode_GET_LENGTH(unicode);
13748
13749 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013750#ifdef Py_DEBUG
13751 _PyUnicode_HASH(self) = -1;
13752#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013753 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013754#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013755 _PyUnicode_STATE(self).interned = 0;
13756 _PyUnicode_STATE(self).kind = kind;
13757 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013758 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013759 _PyUnicode_STATE(self).ready = 1;
13760 _PyUnicode_WSTR(self) = NULL;
13761 _PyUnicode_UTF8_LENGTH(self) = 0;
13762 _PyUnicode_UTF8(self) = NULL;
13763 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013764 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013765
13766 share_utf8 = 0;
13767 share_wstr = 0;
13768 if (kind == PyUnicode_1BYTE_KIND) {
13769 char_size = 1;
13770 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13771 share_utf8 = 1;
13772 }
13773 else if (kind == PyUnicode_2BYTE_KIND) {
13774 char_size = 2;
13775 if (sizeof(wchar_t) == 2)
13776 share_wstr = 1;
13777 }
13778 else {
13779 assert(kind == PyUnicode_4BYTE_KIND);
13780 char_size = 4;
13781 if (sizeof(wchar_t) == 4)
13782 share_wstr = 1;
13783 }
13784
13785 /* Ensure we won't overflow the length. */
13786 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13787 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013788 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013789 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013790 data = PyObject_MALLOC((length + 1) * char_size);
13791 if (data == NULL) {
13792 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013793 goto onError;
13794 }
13795
Victor Stinnerc3c74152011-10-02 20:39:55 +020013796 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013797 if (share_utf8) {
13798 _PyUnicode_UTF8_LENGTH(self) = length;
13799 _PyUnicode_UTF8(self) = data;
13800 }
13801 if (share_wstr) {
13802 _PyUnicode_WSTR_LENGTH(self) = length;
13803 _PyUnicode_WSTR(self) = (wchar_t *)data;
13804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013805
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013806 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013807 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013808 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013809#ifdef Py_DEBUG
13810 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13811#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013812 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013813 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013814
13815onError:
13816 Py_DECREF(unicode);
13817 Py_DECREF(self);
13818 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013819}
13820
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013821PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013822 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013823\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013824Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013825encoding defaults to the current default string encoding.\n\
13826errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013827
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013828static PyObject *unicode_iter(PyObject *seq);
13829
/* Type object for "str".  The initializer is positional: every entry
   fills the slot named in the comment beside it, so entries must not be
   reordered. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013830PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013831 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013832 "str", /* tp_name */
13833 sizeof(PyUnicodeObject), /* tp_basicsize */
13834 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013835 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013836 (destructor)unicode_dealloc, /* tp_dealloc */
13837 0, /* tp_print */
13838 0, /* tp_getattr */
13839 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013840 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013841 unicode_repr, /* tp_repr */
13842 &unicode_as_number, /* tp_as_number */
13843 &unicode_as_sequence, /* tp_as_sequence */
13844 &unicode_as_mapping, /* tp_as_mapping */
13845 (hashfunc) unicode_hash, /* tp_hash*/
13846 0, /* tp_call*/
13847 (reprfunc) unicode_str, /* tp_str */
13848 PyObject_GenericGetAttr, /* tp_getattro */
13849 0, /* tp_setattro */
13850 0, /* tp_as_buffer */
13851 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013852 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013853 unicode_doc, /* tp_doc */
13854 0, /* tp_traverse */
13855 0, /* tp_clear */
13856 PyUnicode_RichCompare, /* tp_richcompare */
13857 0, /* tp_weaklistoffset */
13858 unicode_iter, /* tp_iter */
13859 0, /* tp_iternext */
13860 unicode_methods, /* tp_methods */
13861 0, /* tp_members */
13862 0, /* tp_getset */
13863 &PyBaseObject_Type, /* tp_base */
13864 0, /* tp_dict */
13865 0, /* tp_descr_get */
13866 0, /* tp_descr_set */
13867 0, /* tp_dictoffset */
13868 0, /* tp_init */
13869 0, /* tp_alloc */
13870 unicode_new, /* tp_new */
13871 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013872};
13873
13874/* Initialize the Unicode implementation */
13875
Victor Stinner3a50e702011-10-18 21:21:00 +020013876int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013877{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013878 int i;
13879
Thomas Wouters477c8d52006-05-27 19:21:47 +000013880 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013881 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013882 0x000A, /* LINE FEED */
13883 0x000D, /* CARRIAGE RETURN */
13884 0x001C, /* FILE SEPARATOR */
13885 0x001D, /* GROUP SEPARATOR */
13886 0x001E, /* RECORD SEPARATOR */
13887 0x0085, /* NEXT LINE */
13888 0x2028, /* LINE SEPARATOR */
13889 0x2029, /* PARAGRAPH SEPARATOR */
13890 };
13891
Fred Drakee4315f52000-05-09 19:53:39 +000013892 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013893 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013894 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013895 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013896 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013897
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013898 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013899 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013900 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013901 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013902
13903 /* initialize the linebreak bloom filter */
13904 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013905 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013906 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013907
13908 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013909
13910#ifdef HAVE_MBCS
13911 winver.dwOSVersionInfoSize = sizeof(winver);
13912 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13913 PyErr_SetFromWindowsErr(0);
13914 return -1;
13915 }
13916#endif
13917 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013918}
13919
13920/* Finalize the Unicode implementation */
13921
/* Free-list clearing hook.  This implementation releases nothing and
   always returns 0. */
Christian Heimesa156e092008-02-16 07:38:31 +000013922int
13923PyUnicode_ClearFreeList(void)
13924{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013925 return 0;
Christian Heimesa156e092008-02-16 07:38:31 +000013926}
13927
Guido van Rossumd57fd912000-03-10 22:53:23 +000013928void
Thomas Wouters78890102000-07-22 19:25:51 +000013929_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013930{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013931 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013932
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013933 Py_XDECREF(unicode_empty);
13934 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013935
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013936 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013937 if (unicode_latin1[i]) {
13938 Py_DECREF(unicode_latin1[i]);
13939 unicode_latin1[i] = NULL;
13940 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013941 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013942 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013943 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013944}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013945
Walter Dörwald16807132007-05-25 13:52:07 +000013946void
13947PyUnicode_InternInPlace(PyObject **p)
13948{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013949 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013950 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013951#ifdef Py_DEBUG
13952 assert(s != NULL);
13953 assert(_PyUnicode_CHECK(s));
13954#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013955 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013956 return;
13957#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013958 /* If it's a subclass, we don't really know what putting
13959 it in the interned dict might do. */
13960 if (!PyUnicode_CheckExact(s))
13961 return;
13962 if (PyUnicode_CHECK_INTERNED(s))
13963 return;
13964 if (interned == NULL) {
13965 interned = PyDict_New();
13966 if (interned == NULL) {
13967 PyErr_Clear(); /* Don't leave an exception */
13968 return;
13969 }
13970 }
13971 /* It might be that the GetItem call fails even
13972 though the key is present in the dictionary,
13973 namely when this happens during a stack overflow. */
13974 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013975 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013976 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013977
Benjamin Peterson29060642009-01-31 22:14:21 +000013978 if (t) {
13979 Py_INCREF(t);
13980 Py_DECREF(*p);
13981 *p = t;
13982 return;
13983 }
Walter Dörwald16807132007-05-25 13:52:07 +000013984
Benjamin Peterson14339b62009-01-31 16:36:08 +000013985 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013986 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013987 PyErr_Clear();
13988 PyThreadState_GET()->recursion_critical = 0;
13989 return;
13990 }
13991 PyThreadState_GET()->recursion_critical = 0;
13992 /* The two references in interned are not counted by refcnt.
13993 The deallocator will take care of this */
13994 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013995 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013996}
13997
13998void
13999PyUnicode_InternImmortal(PyObject **p)
14000{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014001 PyUnicode_InternInPlace(p);
14002 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014003 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014004 Py_INCREF(*p);
14005 }
Walter Dörwald16807132007-05-25 13:52:07 +000014006}
14007
14008PyObject *
14009PyUnicode_InternFromString(const char *cp)
14010{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014011 PyObject *s = PyUnicode_FromString(cp);
14012 if (s == NULL)
14013 return NULL;
14014 PyUnicode_InternInPlace(&s);
14015 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014016}
14017
Alexander Belopolsky40018472011-02-26 01:02:56 +000014018void
14019_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014020{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014021 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014022 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014023 Py_ssize_t i, n;
14024 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014025
Benjamin Peterson14339b62009-01-31 16:36:08 +000014026 if (interned == NULL || !PyDict_Check(interned))
14027 return;
14028 keys = PyDict_Keys(interned);
14029 if (keys == NULL || !PyList_Check(keys)) {
14030 PyErr_Clear();
14031 return;
14032 }
Walter Dörwald16807132007-05-25 13:52:07 +000014033
Benjamin Peterson14339b62009-01-31 16:36:08 +000014034 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14035 detector, interned unicode strings are not forcibly deallocated;
14036 rather, we give them their stolen references back, and then clear
14037 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014038
Benjamin Peterson14339b62009-01-31 16:36:08 +000014039 n = PyList_GET_SIZE(keys);
14040 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014041 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014042 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014043 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014044 if (PyUnicode_READY(s) == -1) {
14045 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014046 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014048 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014049 case SSTATE_NOT_INTERNED:
14050 /* XXX Shouldn't happen */
14051 break;
14052 case SSTATE_INTERNED_IMMORTAL:
14053 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014054 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014055 break;
14056 case SSTATE_INTERNED_MORTAL:
14057 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014058 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014059 break;
14060 default:
14061 Py_FatalError("Inconsistent interned string state.");
14062 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014063 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014064 }
14065 fprintf(stderr, "total size of all interned strings: "
14066 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14067 "mortal/immortal\n", mortal_size, immortal_size);
14068 Py_DECREF(keys);
14069 PyDict_Clear(interned);
14070 Py_DECREF(interned);
14071 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014072}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014073
14074
14075/********************* Unicode Iterator **************************/
14076
/* State of a str iterator: the string being walked and the position of
   the next character to produce. */
14077typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014078 PyObject_HEAD
14079 Py_ssize_t it_index; /* Index of the next character to return */
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014080 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014081} unicodeiterobject;
14082
14083static void
14084unicodeiter_dealloc(unicodeiterobject *it)
14085{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014086 _PyObject_GC_UNTRACK(it);
14087 Py_XDECREF(it->it_seq);
14088 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014089}
14090
14091static int
14092unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14093{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014094 Py_VISIT(it->it_seq);
14095 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014096}
14097
14098static PyObject *
14099unicodeiter_next(unicodeiterobject *it)
14100{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014101 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014102
Benjamin Peterson14339b62009-01-31 16:36:08 +000014103 assert(it != NULL);
14104 seq = it->it_seq;
14105 if (seq == NULL)
14106 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014107 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014109 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14110 int kind = PyUnicode_KIND(seq);
14111 void *data = PyUnicode_DATA(seq);
14112 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14113 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014114 if (item != NULL)
14115 ++it->it_index;
14116 return item;
14117 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014118
Benjamin Peterson14339b62009-01-31 16:36:08 +000014119 Py_DECREF(seq);
14120 it->it_seq = NULL;
14121 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014122}
14123
14124static PyObject *
14125unicodeiter_len(unicodeiterobject *it)
14126{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014127 Py_ssize_t len = 0;
14128 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014129 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014130 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014131}
14132
/* Docstring and method table of the str iterator type; the only method
   is __length_hint__, implemented by unicodeiter_len(). */
14133PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14134
14135static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014136 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014137 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014138 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014139};
14140
/* Type object for "str_iterator".  The initializer is positional: every
   entry fills the slot named in the comment beside it.  Slots after the
   last listed entry are left to default initialization. */
14141PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014142 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14143 "str_iterator", /* tp_name */
14144 sizeof(unicodeiterobject), /* tp_basicsize */
14145 0, /* tp_itemsize */
14146 /* methods */
14147 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14148 0, /* tp_print */
14149 0, /* tp_getattr */
14150 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014151 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014152 0, /* tp_repr */
14153 0, /* tp_as_number */
14154 0, /* tp_as_sequence */
14155 0, /* tp_as_mapping */
14156 0, /* tp_hash */
14157 0, /* tp_call */
14158 0, /* tp_str */
14159 PyObject_GenericGetAttr, /* tp_getattro */
14160 0, /* tp_setattro */
14161 0, /* tp_as_buffer */
14162 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14163 0, /* tp_doc */
14164 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14165 0, /* tp_clear */
14166 0, /* tp_richcompare */
14167 0, /* tp_weaklistoffset */
14168 PyObject_SelfIter, /* tp_iter */
14169 (iternextfunc)unicodeiter_next, /* tp_iternext */
14170 unicodeiter_methods, /* tp_methods */
14171 0, /* tp_members */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014172};
14173
14174static PyObject *
14175unicode_iter(PyObject *seq)
14176{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014177 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014178
Benjamin Peterson14339b62009-01-31 16:36:08 +000014179 if (!PyUnicode_Check(seq)) {
14180 PyErr_BadInternalCall();
14181 return NULL;
14182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014183 if (PyUnicode_READY(seq) == -1)
14184 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014185 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14186 if (it == NULL)
14187 return NULL;
14188 it->it_index = 0;
14189 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014190 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014191 _PyObject_GC_TRACK(it);
14192 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014193}
14194
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014195
14196size_t
14197Py_UNICODE_strlen(const Py_UNICODE *u)
14198{
14199 int res = 0;
14200 while(*u++)
14201 res++;
14202 return res;
14203}
14204
14205Py_UNICODE*
14206Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14207{
14208 Py_UNICODE *u = s1;
14209 while ((*u++ = *s2++));
14210 return s1;
14211}
14212
14213Py_UNICODE*
14214Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14215{
14216 Py_UNICODE *u = s1;
14217 while ((*u++ = *s2++))
14218 if (n-- == 0)
14219 break;
14220 return s1;
14221}
14222
14223Py_UNICODE*
14224Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14225{
14226 Py_UNICODE *u1 = s1;
14227 u1 += Py_UNICODE_strlen(u1);
14228 Py_UNICODE_strcpy(u1, s2);
14229 return s1;
14230}
14231
14232int
14233Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14234{
14235 while (*s1 && *s2 && *s1 == *s2)
14236 s1++, s2++;
14237 if (*s1 && *s2)
14238 return (*s1 < *s2) ? -1 : +1;
14239 if (*s1)
14240 return 1;
14241 if (*s2)
14242 return -1;
14243 return 0;
14244}
14245
14246int
14247Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14248{
14249 register Py_UNICODE u1, u2;
14250 for (; n != 0; n--) {
14251 u1 = *s1;
14252 u2 = *s2;
14253 if (u1 != u2)
14254 return (u1 < u2) ? -1 : +1;
14255 if (u1 == '\0')
14256 return 0;
14257 s1++;
14258 s2++;
14259 }
14260 return 0;
14261}
14262
14263Py_UNICODE*
14264Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14265{
14266 const Py_UNICODE *p;
14267 for (p = s; *p; p++)
14268 if (*p == c)
14269 return (Py_UNICODE*)p;
14270 return NULL;
14271}
14272
14273Py_UNICODE*
14274Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14275{
14276 const Py_UNICODE *p;
14277 p = s + Py_UNICODE_strlen(s);
14278 while (p != s) {
14279 p--;
14280 if (*p == c)
14281 return (Py_UNICODE*)p;
14282 }
14283 return NULL;
14284}
Victor Stinner331ea922010-08-10 16:37:20 +000014285
Victor Stinner71133ff2010-09-01 23:43:53 +000014286Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014287PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014288{
Victor Stinner577db2c2011-10-11 22:12:48 +020014289 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014290 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014292 if (!PyUnicode_Check(unicode)) {
14293 PyErr_BadArgument();
14294 return NULL;
14295 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014296 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014297 if (u == NULL)
14298 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014299 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014300 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014301 PyErr_NoMemory();
14302 return NULL;
14303 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014304 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014305 size *= sizeof(Py_UNICODE);
14306 copy = PyMem_Malloc(size);
14307 if (copy == NULL) {
14308 PyErr_NoMemory();
14309 return NULL;
14310 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014311 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014312 return copy;
14313}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014314
Georg Brandl66c221e2010-10-14 07:04:07 +000014315/* A _string module, to export formatter_parser and formatter_field_name_split
14316 to the string.Formatter class implemented in Python. */
14317
/* Method table of the _string module; both entries take a single
   argument (METH_O). */
14318static PyMethodDef _string_methods[] = {
14319 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14320 METH_O, PyDoc_STR("split the argument as a field name")},
14321 {"formatter_parser", (PyCFunction) formatter_parser,
14322 METH_O, PyDoc_STR("parse the argument as a format string")},
14323 {NULL, NULL}
14324};
14325
/* Module definition for "_string", handed to PyModule_Create() by
   PyInit__string(); it carries the module name, its docstring and the
   _string_methods table. */
14326static struct PyModuleDef _string_module = {
14327 PyModuleDef_HEAD_INIT,
14328 "_string",
14329 PyDoc_STR("string helper module"),
14330 0,
14331 _string_methods,
14332 NULL,
14333 NULL,
14334 NULL,
14335 NULL
14336};
14337
14338PyMODINIT_FUNC
14339PyInit__string(void)
14340{
14341 return PyModule_Create(&_string_module);
14342}
14343
14344
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014345#ifdef __cplusplus
14346}
14347#endif