blob: 30a1377add99b9609c374e6dde575f73c30ecb71 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Byte-order selection: big endian only when the build configuration
   defines WORDS_BIGENDIAN, little endian in every other case. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
56
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Sanity check used by the accessor macros in this file.  Debug builds run
   _PyUnicode_CheckConsistency() with check_content == 0; all other builds
   only perform the PyUnicode_Check() type test. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020074
/* Field accessors for the string object layouts.

   The underscore-prefixed macros without assert() are plain casts plus a
   member access and can be used as assignment targets.  PyUnicode_UTF8,
   PyUnicode_UTF8_LENGTH, _PyUnicode_KIND and _PyUnicode_GET_LENGTH first
   assert that the object passes _PyUnicode_CHECK(). */

/* utf8 member of the PyCompactUnicodeObject layout */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 bytes of a ready string: for a compact ASCII object this is the
   memory directly after the PyASCIIObject header, otherwise the utf8
   member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* utf8_length member of the PyCompactUnicodeObject layout */
#define _PyUnicode_UTF8_LENGTH(op) (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Length of the UTF-8 form of a ready string: the character count for a
   compact ASCII object, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr pointer and its length */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state and hash members of the PyASCIIObject header */
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)

/* Checked read of state.kind */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)

/* Checked read of the length member */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* data.any pointer of the PyUnicodeObject layout */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200109
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready(op).
   The object is run through _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200116
/* True if the utf8 pointer of the object is the character data itself
   (no separate UTF-8 buffer).  Must not be used on compact ASCII objects. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object is the character data itself
   (no separate wchar_t buffer).  The macro only refers to its own
   argument, so it works whatever the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
124
/* True if the Unicode object owns a separately allocated UTF-8 buffer:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!PyUnicode_IS_COMPACT_ASCII(op)                           \
      && _PyUnicode_UTF8(op)                                    \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr buffer:
   its wstr pointer is set and either the string is not ready yet or the
   pointer is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) &&                                    \
      (!PyUnicode_IS_READY(op) ||                               \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
140
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each argument is evaluated exactly once.  The `to` expression is
   parenthesized before it is cast, so a compound expression is converted
   as a whole rather than only its first operand.  The main loop copies
   four characters per iteration; the trailing loop handles the remaining
   zero to three characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
/* The Unicode string has been modified: drop the cached hash by storing
   -1 in the hash member. */
#define _PyUnicode_DIRTY(op)            \
    do {                                \
        _PyUnicode_HASH(op) = -1;       \
    } while (0)
167
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty;

/* List of static strings. */
static _Py_Identifier *static_strings;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point (0..255). */
static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 otherwise.  Each row
   below covers sixteen codes. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
218
/* Forward declarations of static helpers that are used before their
   definitions further down in this file. */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static void copy_characters(
    PyObject *to, Py_ssize_t to_start,
    PyObject *from, Py_ssize_t from_start,
    Py_ssize_t how_many);

/* String constructors, one per source character width. */
static PyObject *
unicode_fromascii(const unsigned char *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoder error-handling helpers. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000248
/* Same for linebreaks: a 128-entry table indexed by ASCII code, 1 for a
   line break character and 0 otherwise.  Each row below covers sixteen
   codes. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
276
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300277/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
278 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000279Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000280PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000282#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000283 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 /* This is actually an illegal character, so it should
286 not be passed to unichr. */
287 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000288#endif
289}
290
#ifdef Py_DEBUG
/* Debug-build invariant checker for a unicode object.

   op:            object to verify; must pass PyUnicode_Check().
   check_content: when non-zero (and the string is not of WCHAR kind) the
                  characters are scanned as well, to verify that no code
                  point exceeds 0x10FFFF and that the narrowest possible
                  kind is used for the largest character present.

   Always returns 1 so that the call can be wrapped in assert().  A broken
   invariant trips one of the assert()s below; an out-of-range character
   dumps the string and calls abort(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: one byte per character and always ready */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer may be set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact: separate data buffer */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the data buffer exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (maxchar > 0x10FFFF) {
            printf("Invalid Unicode string! {");
            for (i=0; i < ascii->length; i++)
            {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (i)
                    printf(", U+%04x", ch);
                else
                    printf("U+%04x", ch);
            }
            /* ascii->length is a Py_ssize_t: print it with the matching
               length modifier instead of %lu */
            printf("} (len=%" PY_FORMAT_SIZE_T "d)\n", ascii->length);
            /* abort() is not required to flush stdio buffers */
            fflush(stdout);
            abort();
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= 0x10FFFF);
        }
    }
    return 1;
}
#endif
415
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100416static PyObject*
417unicode_result_wchar(PyObject *unicode)
418{
419#ifndef Py_DEBUG
420 Py_ssize_t len;
421
422 assert(Py_REFCNT(unicode) == 1);
423
424 len = _PyUnicode_WSTR_LENGTH(unicode);
425 if (len == 0) {
426 Py_INCREF(unicode_empty);
427 Py_DECREF(unicode);
428 return unicode_empty;
429 }
430
431 if (len == 1) {
432 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
433 if (ch < 256) {
434 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
435 Py_DECREF(unicode);
436 return latin1_char;
437 }
438 }
439
440 if (_PyUnicode_Ready(unicode) < 0) {
441 Py_XDECREF(unicode);
442 return NULL;
443 }
444#else
445 /* don't make the result ready in debug mode to ensure that the caller
446 makes the string ready before using it */
447 assert(_PyUnicode_CheckConsistency(unicode, 1));
448#endif
449 return unicode;
450}
451
452static PyObject*
453unicode_result_ready(PyObject *unicode)
454{
455 Py_ssize_t length;
456
457 length = PyUnicode_GET_LENGTH(unicode);
458 if (length == 0) {
459 if (unicode != unicode_empty) {
460 Py_INCREF(unicode_empty);
461 Py_DECREF(unicode);
462 }
463 return unicode_empty;
464 }
465
466 if (length == 1) {
467 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
468 if (ch < 256) {
469 PyObject *latin1_char = unicode_latin1[ch];
470 if (latin1_char != NULL) {
471 if (unicode != latin1_char) {
472 Py_INCREF(latin1_char);
473 Py_DECREF(unicode);
474 }
475 return latin1_char;
476 }
477 else {
478 assert(_PyUnicode_CheckConsistency(unicode, 1));
479 Py_INCREF(unicode);
480 unicode_latin1[ch] = unicode;
481 return unicode;
482 }
483 }
484 }
485
486 assert(_PyUnicode_CheckConsistency(unicode, 1));
487 return unicode;
488}
489
490static PyObject*
491unicode_result(PyObject *unicode)
492{
493 assert(_PyUnicode_CHECK(unicode));
494 if (PyUnicode_IS_READY(unicode))
495 return unicode_result_ready(unicode);
496 else
497 return unicode_result_wchar(unicode);
498}
499
/* Windows version information, only present in builds with HAVE_MBCS.
   NOTE(review): presumably filled in during module initialization and
   consulted by the MBCS codec code -- confirm against the rest of the
   file. */
#ifdef HAVE_MBCS
static OSVERSIONINFOEX winver;
#endif
503
Thomas Wouters477c8d52006-05-27 19:21:47 +0000504/* --- Bloom Filters ----------------------------------------------------- */
505
506/* stuff to implement simple "bloom filters" for Unicode characters.
507 to keep things simple, we use a single bitmask, using the least 5
508 bits from each unicode characters as the bit index. */
509
510/* the linebreak mask is set up by Unicode_Init below */
511
Antoine Pitrouf068f942010-01-13 14:19:12 +0000512#if LONG_BIT >= 128
513#define BLOOM_WIDTH 128
514#elif LONG_BIT >= 64
515#define BLOOM_WIDTH 64
516#elif LONG_BIT >= 32
517#define BLOOM_WIDTH 32
518#else
519#error "LONG_BIT is smaller than 32"
520#endif
521
/* Type of a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bitmask used by BLOOM_LINEBREAK for characters outside the ASCII range. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low bits of ch
   (ch modulo BLOOM_WIDTH).  BLOOM evaluates to non-zero when the bit is
   set.  The mask argument is parenthesized so that a compound expression
   is combined as a whole; for BLOOM_ADD it must be an lvalue. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup for ch < 128, otherwise the bloom mask is
   used as a cheap pre-filter before calling Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000532
Alexander Belopolsky40018472011-02-26 01:02:56 +0000533Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200534make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535{
536 /* calculate simple bloom-style bitmask for a given unicode string */
537
Antoine Pitrouf068f942010-01-13 14:19:12 +0000538 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539 Py_ssize_t i;
540
541 mask = 0;
542 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200543 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544
545 return mask;
546}
547
/* True if chr occurs in the string object str.  The bloom mask is tested
   first so that most misses avoid the PyUnicode_FindChar() call. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200552/* Compilation of templated routines */
553
554#include "stringlib/asciilib.h"
555#include "stringlib/fastsearch.h"
556#include "stringlib/partition.h"
557#include "stringlib/split.h"
558#include "stringlib/count.h"
559#include "stringlib/find.h"
560#include "stringlib/find_max_char.h"
561#include "stringlib/localeutil.h"
562#include "stringlib/undef.h"
563
564#include "stringlib/ucs1lib.h"
565#include "stringlib/fastsearch.h"
566#include "stringlib/partition.h"
567#include "stringlib/split.h"
568#include "stringlib/count.h"
569#include "stringlib/find.h"
570#include "stringlib/find_max_char.h"
571#include "stringlib/localeutil.h"
572#include "stringlib/undef.h"
573
574#include "stringlib/ucs2lib.h"
575#include "stringlib/fastsearch.h"
576#include "stringlib/partition.h"
577#include "stringlib/split.h"
578#include "stringlib/count.h"
579#include "stringlib/find.h"
580#include "stringlib/find_max_char.h"
581#include "stringlib/localeutil.h"
582#include "stringlib/undef.h"
583
584#include "stringlib/ucs4lib.h"
585#include "stringlib/fastsearch.h"
586#include "stringlib/partition.h"
587#include "stringlib/split.h"
588#include "stringlib/count.h"
589#include "stringlib/find.h"
590#include "stringlib/find_max_char.h"
591#include "stringlib/localeutil.h"
592#include "stringlib/undef.h"
593
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200594#include "stringlib/unicodedefs.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100598#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200599
Guido van Rossumd57fd912000-03-10 22:53:23 +0000600/* --- Unicode Object ----------------------------------------------------- */
601
/* Forward declaration; the definition is further down in this file.
   fixfct is a callback that takes a string object and returns a Py_UCS4.
   NOTE(review): the exact contract of fixfct is not visible from this
   declaration -- see the definition. */
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200604
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200605Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
606 Py_ssize_t size, Py_UCS4 ch,
607 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200608{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200609 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
610
611 switch (kind) {
612 case PyUnicode_1BYTE_KIND:
613 {
614 Py_UCS1 ch1 = (Py_UCS1) ch;
615 if (ch1 == ch)
616 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
617 else
618 return -1;
619 }
620 case PyUnicode_2BYTE_KIND:
621 {
622 Py_UCS2 ch2 = (Py_UCS2) ch;
623 if (ch2 == ch)
624 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
625 else
626 return -1;
627 }
628 case PyUnicode_4BYTE_KIND:
629 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
630 default:
631 assert(0);
632 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200633 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200634}
635
Victor Stinnerfe226c02011-10-03 03:52:20 +0200636static PyObject*
637resize_compact(PyObject *unicode, Py_ssize_t length)
638{
639 Py_ssize_t char_size;
640 Py_ssize_t struct_size;
641 Py_ssize_t new_size;
642 int share_wstr;
643
644 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200645 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200646 if (PyUnicode_IS_COMPACT_ASCII(unicode))
647 struct_size = sizeof(PyASCIIObject);
648 else
649 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200650 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200651
652 _Py_DEC_REFTOTAL;
653 _Py_ForgetReference(unicode);
654
655 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
656 PyErr_NoMemory();
657 return NULL;
658 }
659 new_size = (struct_size + (length + 1) * char_size);
660
661 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
662 if (unicode == NULL) {
663 PyObject_Del(unicode);
664 PyErr_NoMemory();
665 return NULL;
666 }
667 _Py_NewReference(unicode);
668 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200669 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200670 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200671 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
672 _PyUnicode_WSTR_LENGTH(unicode) = length;
673 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
675 length, 0);
676 return unicode;
677}
678
Alexander Belopolsky40018472011-02-26 01:02:56 +0000679static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200680resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000681{
Victor Stinner95663112011-10-04 01:03:50 +0200682 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200683 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200684 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000685
Victor Stinner95663112011-10-04 01:03:50 +0200686 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200687
688 if (PyUnicode_IS_READY(unicode)) {
689 Py_ssize_t char_size;
690 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200691 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 void *data;
693
694 data = _PyUnicode_DATA_ANY(unicode);
695 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200696 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200697 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
698 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200699 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
700 {
701 PyObject_DEL(_PyUnicode_UTF8(unicode));
702 _PyUnicode_UTF8(unicode) = NULL;
703 _PyUnicode_UTF8_LENGTH(unicode) = 0;
704 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200705
706 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
707 PyErr_NoMemory();
708 return -1;
709 }
710 new_size = (length + 1) * char_size;
711
712 data = (PyObject *)PyObject_REALLOC(data, new_size);
713 if (data == NULL) {
714 PyErr_NoMemory();
715 return -1;
716 }
717 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200718 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200720 _PyUnicode_WSTR_LENGTH(unicode) = length;
721 }
722 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200723 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200724 _PyUnicode_UTF8_LENGTH(unicode) = length;
725 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200726 _PyUnicode_LENGTH(unicode) = length;
727 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200728 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200729 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200730 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200731 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 }
Victor Stinner95663112011-10-04 01:03:50 +0200733 assert(_PyUnicode_WSTR(unicode) != NULL);
734
735 /* check for integer overflow */
736 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
737 PyErr_NoMemory();
738 return -1;
739 }
740 wstr = _PyUnicode_WSTR(unicode);
741 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
742 if (!wstr) {
743 PyErr_NoMemory();
744 return -1;
745 }
746 _PyUnicode_WSTR(unicode) = wstr;
747 _PyUnicode_WSTR(unicode)[length] = 0;
748 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200749 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000750 return 0;
751}
752
Victor Stinnerfe226c02011-10-03 03:52:20 +0200753static PyObject*
754resize_copy(PyObject *unicode, Py_ssize_t length)
755{
756 Py_ssize_t copy_length;
757 if (PyUnicode_IS_COMPACT(unicode)) {
758 PyObject *copy;
759 assert(PyUnicode_IS_READY(unicode));
760
761 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
762 if (copy == NULL)
763 return NULL;
764
765 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200766 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200767 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200768 }
769 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200770 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771 assert(_PyUnicode_WSTR(unicode) != NULL);
772 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200773 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774 if (w == NULL)
775 return NULL;
776 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
777 copy_length = Py_MIN(copy_length, length);
778 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
779 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200780 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200781 }
782}
783
/* We allocate one more character than requested to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200793#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200794static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795#endif
796
Alexander Belopolsky40018472011-02-26 01:02:56 +0000797static PyUnicodeObject *
798_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000799{
800 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000802
Thomas Wouters477c8d52006-05-27 19:21:47 +0000803 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000804 if (length == 0 && unicode_empty != NULL) {
805 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200806 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807 }
808
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000809 /* Ensure we won't overflow the size. */
810 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
811 return (PyUnicodeObject *)PyErr_NoMemory();
812 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200813 if (length < 0) {
814 PyErr_SetString(PyExc_SystemError,
815 "Negative size passed to _PyUnicode_New");
816 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000817 }
818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200819#ifdef Py_DEBUG
820 ++unicode_old_new_calls;
821#endif
822
823 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
824 if (unicode == NULL)
825 return NULL;
826 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
827 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
828 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000829 PyErr_NoMemory();
830 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200832
Jeremy Hyltond8082792003-09-16 19:41:39 +0000833 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000834 * the caller fails before initializing str -- unicode_resize()
835 * reads str[0], and the Keep-Alive optimization can keep memory
836 * allocated for str alive across a call to unicode_dealloc(unicode).
837 * We don't want unicode_resize to read uninitialized memory in
838 * that case.
839 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840 _PyUnicode_WSTR(unicode)[0] = 0;
841 _PyUnicode_WSTR(unicode)[length] = 0;
842 _PyUnicode_WSTR_LENGTH(unicode) = length;
843 _PyUnicode_HASH(unicode) = -1;
844 _PyUnicode_STATE(unicode).interned = 0;
845 _PyUnicode_STATE(unicode).kind = 0;
846 _PyUnicode_STATE(unicode).compact = 0;
847 _PyUnicode_STATE(unicode).ready = 0;
848 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200849 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200850 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200851 _PyUnicode_UTF8(unicode) = NULL;
852 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100853 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000854 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000855
Benjamin Peterson29060642009-01-31 22:14:21 +0000856 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000857 /* XXX UNREF/NEWREF interface should be more symmetrical */
858 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000859 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000860 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000861 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862}
863
Victor Stinnerf42dc442011-10-02 23:33:16 +0200864static const char*
865unicode_kind_name(PyObject *unicode)
866{
Victor Stinner42dfd712011-10-03 14:41:45 +0200867 /* don't check consistency: unicode_kind_name() is called from
868 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200869 if (!PyUnicode_IS_COMPACT(unicode))
870 {
871 if (!PyUnicode_IS_READY(unicode))
872 return "wstr";
873 switch(PyUnicode_KIND(unicode))
874 {
875 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200876 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200877 return "legacy ascii";
878 else
879 return "legacy latin1";
880 case PyUnicode_2BYTE_KIND:
881 return "legacy UCS2";
882 case PyUnicode_4BYTE_KIND:
883 return "legacy UCS4";
884 default:
885 return "<legacy invalid kind>";
886 }
887 }
888 assert(PyUnicode_IS_READY(unicode));
889 switch(PyUnicode_KIND(unicode))
890 {
891 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200892 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200893 return "ascii";
894 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200895 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200896 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200897 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200898 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200899 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200900 default:
901 return "<invalid compact kind>";
902 }
903}
904
#ifdef Py_DEBUG
/* Debug-build counter incremented by PyUnicode_New(). */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Return the result of the PyUnicode_UTF8() macro for `unicode`. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Return the result of the _PyUnicode_COMPACT_DATA() macro for `unicode`. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact flags and the candidate data
   addresses to stdout, then return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", (void*)_PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of the string object `op` to stdout: kind name,
   length, wstr pointer (flagged "shared" when it equals the data pointer),
   and -- unless the object is compact ASCII -- the wstr length and the
   utf8 pointer and length, followed by the data pointer.

   The fields are read directly from the structures, so the function can be
   used on objects in an inconsistent state. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* Compact objects store the characters right after the header;
       the others keep a separate pointer. */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;

    /* Lengths are signed (Py_ssize_t): print them with %zd so that a
       corrupted negative value stays recognizable. */
    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);

    if (ascii->wstr == data)
        printf("shared ");
    /* %p requires a void pointer argument. */
    printf("wstr=%p", (void *)ascii->wstr);

    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
958
959PyObject *
960PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
961{
962 PyObject *obj;
963 PyCompactUnicodeObject *unicode;
964 void *data;
965 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200966 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 Py_ssize_t char_size;
968 Py_ssize_t struct_size;
969
970 /* Optimization for empty strings */
971 if (size == 0 && unicode_empty != NULL) {
972 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200973 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974 }
975
976#ifdef Py_DEBUG
977 ++unicode_new_new_calls;
978#endif
979
Victor Stinner9e9d6892011-10-04 01:02:02 +0200980 is_ascii = 0;
981 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982 struct_size = sizeof(PyCompactUnicodeObject);
983 if (maxchar < 128) {
984 kind_state = PyUnicode_1BYTE_KIND;
985 char_size = 1;
986 is_ascii = 1;
987 struct_size = sizeof(PyASCIIObject);
988 }
989 else if (maxchar < 256) {
990 kind_state = PyUnicode_1BYTE_KIND;
991 char_size = 1;
992 }
993 else if (maxchar < 65536) {
994 kind_state = PyUnicode_2BYTE_KIND;
995 char_size = 2;
996 if (sizeof(wchar_t) == 2)
997 is_sharing = 1;
998 }
999 else {
1000 kind_state = PyUnicode_4BYTE_KIND;
1001 char_size = 4;
1002 if (sizeof(wchar_t) == 4)
1003 is_sharing = 1;
1004 }
1005
1006 /* Ensure we won't overflow the size. */
1007 if (size < 0) {
1008 PyErr_SetString(PyExc_SystemError,
1009 "Negative size passed to PyUnicode_New");
1010 return NULL;
1011 }
1012 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1013 return PyErr_NoMemory();
1014
1015 /* Duplicated allocation code from _PyObject_New() instead of a call to
1016 * PyObject_New() so we are able to allocate space for the object and
1017 * it's data buffer.
1018 */
1019 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1020 if (obj == NULL)
1021 return PyErr_NoMemory();
1022 obj = PyObject_INIT(obj, &PyUnicode_Type);
1023 if (obj == NULL)
1024 return NULL;
1025
1026 unicode = (PyCompactUnicodeObject *)obj;
1027 if (is_ascii)
1028 data = ((PyASCIIObject*)obj) + 1;
1029 else
1030 data = unicode + 1;
1031 _PyUnicode_LENGTH(unicode) = size;
1032 _PyUnicode_HASH(unicode) = -1;
1033 _PyUnicode_STATE(unicode).interned = 0;
1034 _PyUnicode_STATE(unicode).kind = kind_state;
1035 _PyUnicode_STATE(unicode).compact = 1;
1036 _PyUnicode_STATE(unicode).ready = 1;
1037 _PyUnicode_STATE(unicode).ascii = is_ascii;
1038 if (is_ascii) {
1039 ((char*)data)[size] = 0;
1040 _PyUnicode_WSTR(unicode) = NULL;
1041 }
1042 else if (kind_state == PyUnicode_1BYTE_KIND) {
1043 ((char*)data)[size] = 0;
1044 _PyUnicode_WSTR(unicode) = NULL;
1045 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001046 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001047 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 }
1049 else {
1050 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001051 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 if (kind_state == PyUnicode_2BYTE_KIND)
1053 ((Py_UCS2*)data)[size] = 0;
1054 else /* kind_state == PyUnicode_4BYTE_KIND */
1055 ((Py_UCS4*)data)[size] = 0;
1056 if (is_sharing) {
1057 _PyUnicode_WSTR_LENGTH(unicode) = size;
1058 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1059 }
1060 else {
1061 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1062 _PyUnicode_WSTR(unicode) = NULL;
1063 }
1064 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001065 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001066 return obj;
1067}
1068
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   Each valid high/low surrogate pair in [begin, end) is combined into one
   code point; every other wchar_t (lone surrogates included) is copied
   unchanged.  The output is written to the 4-byte data buffer of `unicode`.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        Py_UCS4 ch;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src + 1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* high + low surrogate: one code point at or above U+10000 */
            ch = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            ch = *src;
            src += 1;
        }
        *dst++ = ch;
    }
    /* The destination must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1107
Victor Stinnercd9950f2011-10-02 00:34:53 +02001108static int
1109_PyUnicode_Dirty(PyObject *unicode)
1110{
Victor Stinner910337b2011-10-03 03:20:16 +02001111 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001112 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001113 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001114 "Cannot modify a string having more than 1 reference");
1115 return -1;
1116 }
1117 _PyUnicode_DIRTY(unicode);
1118 return 0;
1119}
1120
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001121static int
1122_copy_characters(PyObject *to, Py_ssize_t to_start,
1123 PyObject *from, Py_ssize_t from_start,
1124 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 unsigned int from_kind, to_kind;
1127 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001128 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 assert(PyUnicode_Check(from));
1131 assert(PyUnicode_Check(to));
1132 assert(PyUnicode_IS_READY(from));
1133 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1136 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1137 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001139 if (how_many == 0)
1140 return 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001143 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001145 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147#ifdef Py_DEBUG
1148 if (!check_maxchar
1149 && (from_kind > to_kind
1150 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1153 Py_UCS4 ch;
1154 Py_ssize_t i;
1155 for (i=0; i < how_many; i++) {
1156 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1157 assert(ch <= to_maxchar);
1158 }
1159 }
1160#endif
1161 fast = (from_kind == to_kind);
1162 if (check_maxchar
1163 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1164 {
1165 /* deny latin1 => ascii */
1166 fast = 0;
1167 }
1168
1169 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001170 Py_MEMCPY((char*)to_data + to_kind * to_start,
1171 (char*)from_data + from_kind * from_start,
1172 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001174 else if (from_kind == PyUnicode_1BYTE_KIND
1175 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001176 {
1177 _PyUnicode_CONVERT_BYTES(
1178 Py_UCS1, Py_UCS2,
1179 PyUnicode_1BYTE_DATA(from) + from_start,
1180 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1181 PyUnicode_2BYTE_DATA(to) + to_start
1182 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001183 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001184 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001185 && to_kind == PyUnicode_4BYTE_KIND)
1186 {
1187 _PyUnicode_CONVERT_BYTES(
1188 Py_UCS1, Py_UCS4,
1189 PyUnicode_1BYTE_DATA(from) + from_start,
1190 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1191 PyUnicode_4BYTE_DATA(to) + to_start
1192 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001193 }
1194 else if (from_kind == PyUnicode_2BYTE_KIND
1195 && to_kind == PyUnicode_4BYTE_KIND)
1196 {
1197 _PyUnicode_CONVERT_BYTES(
1198 Py_UCS2, Py_UCS4,
1199 PyUnicode_2BYTE_DATA(from) + from_start,
1200 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1201 PyUnicode_4BYTE_DATA(to) + to_start
1202 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001204 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001205 /* check if max_char(from substring) <= max_char(to) */
1206 if (from_kind > to_kind
1207 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001208 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001209 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001210 /* slow path to check for character overflow */
1211 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001212 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 Py_ssize_t i;
1214
Victor Stinner56c161a2011-10-06 02:47:11 +02001215#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001216 for (i=0; i < how_many; i++) {
1217 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001218 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1220 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001221#else
1222 if (!check_maxchar) {
1223 for (i=0; i < how_many; i++) {
1224 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1225 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1226 }
1227 }
1228 else {
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 if (ch > to_maxchar)
1232 return 1;
1233 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1234 }
1235 }
1236#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001238 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001239 assert(0 && "inconsistent state");
1240 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001241 }
1242 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001243 return 0;
1244}
1245
1246static void
1247copy_characters(PyObject *to, Py_ssize_t to_start,
1248 PyObject *from, Py_ssize_t from_start,
1249 Py_ssize_t how_many)
1250{
1251 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1252}
1253
1254Py_ssize_t
1255PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1256 PyObject *from, Py_ssize_t from_start,
1257 Py_ssize_t how_many)
1258{
1259 int err;
1260
1261 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1262 PyErr_BadInternalCall();
1263 return -1;
1264 }
1265
1266 if (PyUnicode_READY(from))
1267 return -1;
1268 if (PyUnicode_READY(to))
1269 return -1;
1270
1271 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1272 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1273 PyErr_Format(PyExc_SystemError,
1274 "Cannot write %zi characters at %zi "
1275 "in a string of %zi characters",
1276 how_many, to_start, PyUnicode_GET_LENGTH(to));
1277 return -1;
1278 }
1279
1280 if (how_many == 0)
1281 return 0;
1282
1283 if (_PyUnicode_Dirty(to))
1284 return -1;
1285
1286 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1287 if (err) {
1288 PyErr_Format(PyExc_SystemError,
1289 "Cannot copy %s characters "
1290 "into a string of %s characters",
1291 unicode_kind_name(from),
1292 unicode_kind_name(to));
1293 return -1;
1294 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001295 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296}
1297
Victor Stinner17222162011-09-28 22:15:37 +02001298/* Find the maximum code point and count the number of surrogate pairs so a
1299 correct string length can be computed before converting a string to UCS4.
1300 This function counts single surrogates as a character and not as a pair.
1301
1302 Return 0 on success, or -1 on error. */
1303static int
1304find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1305 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306{
1307 const wchar_t *iter;
1308
Victor Stinnerc53be962011-10-02 21:33:54 +02001309 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 *num_surrogates = 0;
1311 *maxchar = 0;
1312
1313 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001314 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001316#if SIZEOF_WCHAR_T != 2
1317 if (*maxchar >= 0x10000)
1318 return 0;
1319#endif
1320 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321#if SIZEOF_WCHAR_T == 2
1322 if (*iter >= 0xD800 && *iter <= 0xDBFF
1323 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1324 {
1325 Py_UCS4 surrogate_val;
1326 surrogate_val = (((iter[0] & 0x3FF)<<10)
1327 | (iter[1] & 0x3FF)) + 0x10000;
1328 ++(*num_surrogates);
1329 if (surrogate_val > *maxchar)
1330 *maxchar = surrogate_val;
1331 iter += 2;
1332 }
1333 else
1334 iter++;
1335#else
1336 iter++;
1337#endif
1338 }
1339 return 0;
1340}
1341
1342#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001343static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344#endif
1345
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001346int
1347_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001348{
1349 wchar_t *end;
1350 Py_UCS4 maxchar = 0;
1351 Py_ssize_t num_surrogates;
1352#if SIZEOF_WCHAR_T == 2
1353 Py_ssize_t length_wo_surrogates;
1354#endif
1355
Georg Brandl7597add2011-10-05 16:36:47 +02001356 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001357 strings were created using _PyObject_New() and where no canonical
1358 representation (the str field) has been set yet aka strings
1359 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001360 assert(_PyUnicode_CHECK(unicode));
1361 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001362 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001363 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001364 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001365 /* Actually, it should neither be interned nor be anything else: */
1366 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367
1368#ifdef Py_DEBUG
1369 ++unicode_ready_calls;
1370#endif
1371
1372 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001373 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001374 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376
1377 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001378 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1379 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380 PyErr_NoMemory();
1381 return -1;
1382 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001383 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384 _PyUnicode_WSTR(unicode), end,
1385 PyUnicode_1BYTE_DATA(unicode));
1386 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1387 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1388 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1389 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001390 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001391 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001392 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 }
1394 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001395 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001396 _PyUnicode_UTF8(unicode) = NULL;
1397 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398 }
1399 PyObject_FREE(_PyUnicode_WSTR(unicode));
1400 _PyUnicode_WSTR(unicode) = NULL;
1401 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1402 }
1403 /* In this case we might have to convert down from 4-byte native
1404 wchar_t to 2-byte unicode. */
1405 else if (maxchar < 65536) {
1406 assert(num_surrogates == 0 &&
1407 "FindMaxCharAndNumSurrogatePairs() messed up");
1408
Victor Stinner506f5922011-09-28 22:34:18 +02001409#if SIZEOF_WCHAR_T == 2
1410 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001411 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001412 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1413 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1414 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001415 _PyUnicode_UTF8(unicode) = NULL;
1416 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001417#else
1418 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001419 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001420 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001421 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001422 PyErr_NoMemory();
1423 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 }
Victor Stinner506f5922011-09-28 22:34:18 +02001425 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1426 _PyUnicode_WSTR(unicode), end,
1427 PyUnicode_2BYTE_DATA(unicode));
1428 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1429 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1430 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001431 _PyUnicode_UTF8(unicode) = NULL;
1432 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001433 PyObject_FREE(_PyUnicode_WSTR(unicode));
1434 _PyUnicode_WSTR(unicode) = NULL;
1435 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1436#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 }
1438 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1439 else {
1440#if SIZEOF_WCHAR_T == 2
1441 /* in case the native representation is 2-bytes, we need to allocate a
1442 new normalized 4-byte version. */
1443 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001444 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1445 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446 PyErr_NoMemory();
1447 return -1;
1448 }
1449 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1450 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001451 _PyUnicode_UTF8(unicode) = NULL;
1452 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001453 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1454 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001455 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 PyObject_FREE(_PyUnicode_WSTR(unicode));
1457 _PyUnicode_WSTR(unicode) = NULL;
1458 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1459#else
1460 assert(num_surrogates == 0);
1461
Victor Stinnerc3c74152011-10-02 20:39:55 +02001462 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001464 _PyUnicode_UTF8(unicode) = NULL;
1465 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1467#endif
1468 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1469 }
1470 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001471 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 return 0;
1473}
1474
Alexander Belopolsky40018472011-02-26 01:02:56 +00001475static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001476unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477{
Walter Dörwald16807132007-05-25 13:52:07 +00001478 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001479 case SSTATE_NOT_INTERNED:
1480 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001481
Benjamin Peterson29060642009-01-31 22:14:21 +00001482 case SSTATE_INTERNED_MORTAL:
1483 /* revive dead object temporarily for DelItem */
1484 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001485 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001486 Py_FatalError(
1487 "deletion of interned string failed");
1488 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001489
Benjamin Peterson29060642009-01-31 22:14:21 +00001490 case SSTATE_INTERNED_IMMORTAL:
1491 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001492
Benjamin Peterson29060642009-01-31 22:14:21 +00001493 default:
1494 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001495 }
1496
Victor Stinner03490912011-10-03 23:45:12 +02001497 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001498 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001499 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001500 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001501
1502 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001503 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504 }
1505 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001506 if (_PyUnicode_DATA_ANY(unicode))
1507 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001508 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509 }
1510}
1511
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001512#ifdef Py_DEBUG
1513static int
1514unicode_is_singleton(PyObject *unicode)
1515{
1516 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1517 if (unicode == unicode_empty)
1518 return 1;
1519 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1520 {
1521 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1522 if (ch < 256 && unicode_latin1[ch] == unicode)
1523 return 1;
1524 }
1525 return 0;
1526}
1527#endif
1528
Alexander Belopolsky40018472011-02-26 01:02:56 +00001529static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001530unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001531{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001532 if (Py_REFCNT(unicode) != 1)
1533 return 0;
1534 if (PyUnicode_CHECK_INTERNED(unicode))
1535 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001536#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001537 /* singleton refcount is greater than 1 */
1538 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001539#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001540 return 1;
1541}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001542
Victor Stinnerfe226c02011-10-03 03:52:20 +02001543static int
1544unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1545{
1546 PyObject *unicode;
1547 Py_ssize_t old_length;
1548
1549 assert(p_unicode != NULL);
1550 unicode = *p_unicode;
1551
1552 assert(unicode != NULL);
1553 assert(PyUnicode_Check(unicode));
1554 assert(0 <= length);
1555
Victor Stinner910337b2011-10-03 03:20:16 +02001556 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001557 old_length = PyUnicode_WSTR_LENGTH(unicode);
1558 else
1559 old_length = PyUnicode_GET_LENGTH(unicode);
1560 if (old_length == length)
1561 return 0;
1562
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001563 if (length == 0) {
1564 Py_DECREF(*p_unicode);
1565 *p_unicode = unicode_empty;
1566 Py_INCREF(*p_unicode);
1567 return 0;
1568 }
1569
Victor Stinnerfe226c02011-10-03 03:52:20 +02001570 if (!unicode_resizable(unicode)) {
1571 PyObject *copy = resize_copy(unicode, length);
1572 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001573 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001574 Py_DECREF(*p_unicode);
1575 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001576 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001577 }
1578
Victor Stinnerfe226c02011-10-03 03:52:20 +02001579 if (PyUnicode_IS_COMPACT(unicode)) {
1580 *p_unicode = resize_compact(unicode, length);
1581 if (*p_unicode == NULL)
1582 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001583 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001585 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001586 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001587}
1588
Alexander Belopolsky40018472011-02-26 01:02:56 +00001589int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001590PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001591{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592 PyObject *unicode;
1593 if (p_unicode == NULL) {
1594 PyErr_BadInternalCall();
1595 return -1;
1596 }
1597 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001598 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001599 {
1600 PyErr_BadInternalCall();
1601 return -1;
1602 }
1603 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001604}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001605
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001606static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001607unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001608{
1609 PyObject *result;
1610 assert(PyUnicode_IS_READY(*p_unicode));
1611 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1612 return 0;
1613 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1614 maxchar);
1615 if (result == NULL)
1616 return -1;
1617 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1618 PyUnicode_GET_LENGTH(*p_unicode));
1619 Py_DECREF(*p_unicode);
1620 *p_unicode = result;
1621 return 0;
1622}
1623
1624static int
1625unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1626 Py_UCS4 ch)
1627{
1628 if (unicode_widen(p_unicode, ch) < 0)
1629 return -1;
1630 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1631 PyUnicode_DATA(*p_unicode),
1632 (*pos)++, ch);
1633 return 0;
1634}
1635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636static PyObject*
1637get_latin1_char(unsigned char ch)
1638{
Victor Stinnera464fc12011-10-02 20:39:30 +02001639 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001641 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 if (!unicode)
1643 return NULL;
1644 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001645 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646 unicode_latin1[ch] = unicode;
1647 }
1648 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001649 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650}
1651
Alexander Belopolsky40018472011-02-26 01:02:56 +00001652PyObject *
1653PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001654{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001655 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 Py_UCS4 maxchar = 0;
1657 Py_ssize_t num_surrogates;
1658
1659 if (u == NULL)
1660 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001662 /* If the Unicode data is known at construction time, we can apply
1663 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001665 /* Optimization for empty strings */
1666 if (size == 0 && unicode_empty != NULL) {
1667 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001668 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001669 }
Tim Petersced69f82003-09-16 20:30:58 +00001670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671 /* Single character Unicode objects in the Latin-1 range are
1672 shared when using this constructor */
1673 if (size == 1 && *u < 256)
1674 return get_latin1_char((unsigned char)*u);
1675
1676 /* If not empty and not single character, copy the Unicode data
1677 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001678 if (find_maxchar_surrogates(u, u + size,
1679 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001680 return NULL;
1681
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001682 unicode = PyUnicode_New(size - num_surrogates,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 if (!unicode)
1685 return NULL;
1686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001687 switch (PyUnicode_KIND(unicode)) {
1688 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001689 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001690 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1691 break;
1692 case PyUnicode_2BYTE_KIND:
1693#if Py_UNICODE_SIZE == 2
1694 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1695#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001696 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1698#endif
1699 break;
1700 case PyUnicode_4BYTE_KIND:
1701#if SIZEOF_WCHAR_T == 2
1702 /* This is the only case which has to process surrogates, thus
1703 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001704 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705#else
1706 assert(num_surrogates == 0);
1707 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1708#endif
1709 break;
1710 default:
1711 assert(0 && "Impossible state");
1712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001714 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715}
1716
Alexander Belopolsky40018472011-02-26 01:02:56 +00001717PyObject *
1718PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001719{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001720 if (size < 0) {
1721 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001722 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001723 return NULL;
1724 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001725
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001726 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001727 some optimizations which share commonly used objects.
1728 Also, this means the input must be UTF-8, so fall back to the
1729 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001730 if (u != NULL) {
1731
Benjamin Peterson29060642009-01-31 22:14:21 +00001732 /* Optimization for empty strings */
1733 if (size == 0 && unicode_empty != NULL) {
1734 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001735 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001736 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001737
1738 /* Single characters are shared when using this constructor.
1739 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001740 if (size == 1 && (unsigned char)*u < 128)
1741 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001742
1743 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001744 }
1745
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001746 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001747}
1748
Alexander Belopolsky40018472011-02-26 01:02:56 +00001749PyObject *
1750PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001751{
1752 size_t size = strlen(u);
1753 if (size > PY_SSIZE_T_MAX) {
1754 PyErr_SetString(PyExc_OverflowError, "input too long");
1755 return NULL;
1756 }
1757
1758 return PyUnicode_FromStringAndSize(u, size);
1759}
1760
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001761PyObject *
1762_PyUnicode_FromId(_Py_Identifier *id)
1763{
1764 if (!id->object) {
1765 id->object = PyUnicode_FromString(id->string);
1766 if (!id->object)
1767 return NULL;
1768 PyUnicode_InternInPlace(&id->object);
1769 assert(!id->next);
1770 id->next = static_strings;
1771 static_strings = id;
1772 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001773 return id->object;
1774}
1775
1776void
1777_PyUnicode_ClearStaticStrings()
1778{
1779 _Py_Identifier *i;
1780 for (i = static_strings; i; i = i->next) {
1781 Py_DECREF(i->object);
1782 i->object = NULL;
1783 i->next = NULL;
1784 }
1785}
1786
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001787/* Internal function, don't check maximum character */
1788
Victor Stinnere57b1c02011-09-28 22:20:48 +02001789static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001790unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001791{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001792 PyObject *res;
1793#ifdef Py_DEBUG
1794 const unsigned char *p;
1795 const unsigned char *end = s + size;
1796 for (p=s; p < end; p++) {
1797 assert(*p < 128);
1798 }
1799#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001800 if (size == 1)
1801 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001802 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001803 if (!res)
1804 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001805 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001806 return res;
1807}
1808
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001809static Py_UCS4
1810kind_maxchar_limit(unsigned int kind)
1811{
1812 switch(kind) {
1813 case PyUnicode_1BYTE_KIND:
1814 return 0x80;
1815 case PyUnicode_2BYTE_KIND:
1816 return 0x100;
1817 case PyUnicode_4BYTE_KIND:
1818 return 0x10000;
1819 default:
1820 assert(0 && "invalid kind");
1821 return 0x10ffff;
1822 }
1823}
1824
Victor Stinner702c7342011-10-05 13:50:52 +02001825static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001826_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001827{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001829 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001830
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001831 if (size == 0) {
1832 Py_INCREF(unicode_empty);
1833 return unicode_empty;
1834 }
1835 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001836 if (size == 1)
1837 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001838
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001839 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001840 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841 if (!res)
1842 return NULL;
1843 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001844 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001846}
1847
Victor Stinnere57b1c02011-09-28 22:20:48 +02001848static PyObject*
1849_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001850{
1851 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001852 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001853
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001854 if (size == 0) {
1855 Py_INCREF(unicode_empty);
1856 return unicode_empty;
1857 }
1858 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001859 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001860 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001861
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001862 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001863 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001864 if (!res)
1865 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001866 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001868 else {
1869 _PyUnicode_CONVERT_BYTES(
1870 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1871 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001872 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 return res;
1874}
1875
Victor Stinnere57b1c02011-09-28 22:20:48 +02001876static PyObject*
1877_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001878{
1879 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001880 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001881
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001882 if (size == 0) {
1883 Py_INCREF(unicode_empty);
1884 return unicode_empty;
1885 }
1886 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001887 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001888 return get_latin1_char((unsigned char)u[0]);
1889
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001890 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001891 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 if (!res)
1893 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001894 if (max_char < 256)
1895 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1896 PyUnicode_1BYTE_DATA(res));
1897 else if (max_char < 0x10000)
1898 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1899 PyUnicode_2BYTE_DATA(res));
1900 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001902 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 return res;
1904}
1905
1906PyObject*
1907PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1908{
1909 switch(kind) {
1910 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001911 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001913 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001915 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001916 default:
1917 assert(0 && "invalid kind");
1918 PyErr_SetString(PyExc_SystemError, "invalid kind");
1919 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001920 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001921}
1922
Victor Stinner25a4b292011-10-06 12:31:55 +02001923/* Ensure that a string uses the most efficient storage, if it is not the
1924 case: create a new string with of the right kind. Write NULL into *p_unicode
1925 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001926static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001927unicode_adjust_maxchar(PyObject **p_unicode)
1928{
1929 PyObject *unicode, *copy;
1930 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001931 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001932 unsigned int kind;
1933
1934 assert(p_unicode != NULL);
1935 unicode = *p_unicode;
1936 assert(PyUnicode_IS_READY(unicode));
1937 if (PyUnicode_IS_ASCII(unicode))
1938 return;
1939
1940 len = PyUnicode_GET_LENGTH(unicode);
1941 kind = PyUnicode_KIND(unicode);
1942 if (kind == PyUnicode_1BYTE_KIND) {
1943 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001944 max_char = ucs1lib_find_max_char(u, u + len);
1945 if (max_char >= 128)
1946 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001947 }
1948 else if (kind == PyUnicode_2BYTE_KIND) {
1949 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001950 max_char = ucs2lib_find_max_char(u, u + len);
1951 if (max_char >= 256)
1952 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001953 }
1954 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001955 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001956 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001957 max_char = ucs4lib_find_max_char(u, u + len);
1958 if (max_char >= 0x10000)
1959 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001960 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001961 copy = PyUnicode_New(len, max_char);
1962 copy_characters(copy, 0, unicode, 0, len);
1963 Py_DECREF(unicode);
1964 *p_unicode = copy;
1965}
1966
Victor Stinner034f6cf2011-09-30 02:26:44 +02001967PyObject*
1968PyUnicode_Copy(PyObject *unicode)
1969{
Victor Stinner87af4f22011-11-21 23:03:47 +01001970 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001971 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001972
Victor Stinner034f6cf2011-09-30 02:26:44 +02001973 if (!PyUnicode_Check(unicode)) {
1974 PyErr_BadInternalCall();
1975 return NULL;
1976 }
1977 if (PyUnicode_READY(unicode))
1978 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001979
Victor Stinner87af4f22011-11-21 23:03:47 +01001980 length = PyUnicode_GET_LENGTH(unicode);
1981 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001982 if (!copy)
1983 return NULL;
1984 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1985
Victor Stinner87af4f22011-11-21 23:03:47 +01001986 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1987 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001988 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001989 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001990}
1991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001992
Victor Stinnerbc603d12011-10-02 01:00:40 +02001993/* Widen Unicode objects to larger buffers. Don't write terminating null
1994 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995
1996void*
1997_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1998{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001999 Py_ssize_t len;
2000 void *result;
2001 unsigned int skind;
2002
2003 if (PyUnicode_READY(s))
2004 return NULL;
2005
2006 len = PyUnicode_GET_LENGTH(s);
2007 skind = PyUnicode_KIND(s);
2008 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002009 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010 return NULL;
2011 }
2012 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002013 case PyUnicode_2BYTE_KIND:
2014 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2015 if (!result)
2016 return PyErr_NoMemory();
2017 assert(skind == PyUnicode_1BYTE_KIND);
2018 _PyUnicode_CONVERT_BYTES(
2019 Py_UCS1, Py_UCS2,
2020 PyUnicode_1BYTE_DATA(s),
2021 PyUnicode_1BYTE_DATA(s) + len,
2022 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002023 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002024 case PyUnicode_4BYTE_KIND:
2025 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2026 if (!result)
2027 return PyErr_NoMemory();
2028 if (skind == PyUnicode_2BYTE_KIND) {
2029 _PyUnicode_CONVERT_BYTES(
2030 Py_UCS2, Py_UCS4,
2031 PyUnicode_2BYTE_DATA(s),
2032 PyUnicode_2BYTE_DATA(s) + len,
2033 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002035 else {
2036 assert(skind == PyUnicode_1BYTE_KIND);
2037 _PyUnicode_CONVERT_BYTES(
2038 Py_UCS1, Py_UCS4,
2039 PyUnicode_1BYTE_DATA(s),
2040 PyUnicode_1BYTE_DATA(s) + len,
2041 result);
2042 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002043 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002044 default:
2045 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 }
Victor Stinner01698042011-10-04 00:04:26 +02002047 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 return NULL;
2049}
2050
2051static Py_UCS4*
2052as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2053 int copy_null)
2054{
2055 int kind;
2056 void *data;
2057 Py_ssize_t len, targetlen;
2058 if (PyUnicode_READY(string) == -1)
2059 return NULL;
2060 kind = PyUnicode_KIND(string);
2061 data = PyUnicode_DATA(string);
2062 len = PyUnicode_GET_LENGTH(string);
2063 targetlen = len;
2064 if (copy_null)
2065 targetlen++;
2066 if (!target) {
2067 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2068 PyErr_NoMemory();
2069 return NULL;
2070 }
2071 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2072 if (!target) {
2073 PyErr_NoMemory();
2074 return NULL;
2075 }
2076 }
2077 else {
2078 if (targetsize < targetlen) {
2079 PyErr_Format(PyExc_SystemError,
2080 "string is longer than the buffer");
2081 if (copy_null && 0 < targetsize)
2082 target[0] = 0;
2083 return NULL;
2084 }
2085 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002086 if (kind == PyUnicode_1BYTE_KIND) {
2087 Py_UCS1 *start = (Py_UCS1 *) data;
2088 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002090 else if (kind == PyUnicode_2BYTE_KIND) {
2091 Py_UCS2 *start = (Py_UCS2 *) data;
2092 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2093 }
2094 else {
2095 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002096 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002097 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002098 if (copy_null)
2099 target[len] = 0;
2100 return target;
2101}
2102
2103Py_UCS4*
2104PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2105 int copy_null)
2106{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002107 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002108 PyErr_BadInternalCall();
2109 return NULL;
2110 }
2111 return as_ucs4(string, target, targetsize, copy_null);
2112}
2113
2114Py_UCS4*
2115PyUnicode_AsUCS4Copy(PyObject *string)
2116{
2117 return as_ucs4(string, NULL, 0, 1);
2118}
2119
2120#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002121
Alexander Belopolsky40018472011-02-26 01:02:56 +00002122PyObject *
2123PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00002126 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002127 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00002128 PyErr_BadInternalCall();
2129 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 }
2131
Martin v. Löwis790465f2008-04-05 20:41:37 +00002132 if (size == -1) {
2133 size = wcslen(w);
2134 }
2135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002136 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002137}
2138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002139#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002140
Walter Dörwald346737f2007-05-31 10:44:43 +00002141static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002142makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2143 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002144{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002145 *fmt++ = '%';
2146 if (width) {
2147 if (zeropad)
2148 *fmt++ = '0';
2149 fmt += sprintf(fmt, "%d", width);
2150 }
2151 if (precision)
2152 fmt += sprintf(fmt, ".%d", precision);
2153 if (longflag)
2154 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002155 else if (longlongflag) {
2156 /* longlongflag should only ever be nonzero on machines with
2157 HAVE_LONG_LONG defined */
2158#ifdef HAVE_LONG_LONG
2159 char *f = PY_FORMAT_LONG_LONG;
2160 while (*f)
2161 *fmt++ = *f++;
2162#else
2163 /* we shouldn't ever get here */
2164 assert(0);
2165 *fmt++ = 'l';
2166#endif
2167 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002168 else if (size_tflag) {
2169 char *f = PY_FORMAT_SIZE_T;
2170 while (*f)
2171 *fmt++ = *f++;
2172 }
2173 *fmt++ = c;
2174 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002175}
2176
Victor Stinner96865452011-03-01 23:44:09 +00002177/* helper for PyUnicode_FromFormatV() */
2178
/* Parse the width, precision and length-modifier part of a format
   directive. `f` points at the '%' that starts the directive.

   Each result is stored through its output pointer unless that pointer is
   NULL. The return value points at the conversion character (or, for the
   malformed inputs noted below, at the character before it). */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* Skip the '%', then read "width.precision",
       e.g. "%2.5s" => width=2, precision=5. */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Length modifiers: %ld, %lu, %li, %lld, %llu, %lli and the size_t
       forms %zd, %zu, %zi. A modifier is only consumed when it is
       followed by one of those integer conversions. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2241
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002242/* maximum number of characters required for output of %ld. 21 characters
2243 allows for 64-bit integers (in decimal) and an optional sign. */
2244#define MAX_LONG_CHARS 21
2245/* maximum number of characters required for output of %lld.
2246 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2247 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2248#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2249
Walter Dörwaldd2034312007-05-18 16:29:38 +00002250PyObject *
2251PyUnicode_FromFormatV(const char *format, va_list vargs)
2252{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002253 va_list count;
2254 Py_ssize_t callcount = 0;
2255 PyObject **callresults = NULL;
2256 PyObject **callresult = NULL;
2257 Py_ssize_t n = 0;
2258 int width = 0;
2259 int precision = 0;
2260 int zeropad;
2261 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002262 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002263 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002264 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2266 Py_UCS4 argmaxchar;
2267 Py_ssize_t numbersize = 0;
2268 char *numberresults = NULL;
2269 char *numberresult = NULL;
2270 Py_ssize_t i;
2271 int kind;
2272 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002273
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002274 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002275 /* step 1: count the number of %S/%R/%A/%s format specifications
2276 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2277 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002279 * also estimate a upper bound for all the number formats in the string,
2280 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002282 for (f = format; *f; f++) {
2283 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002284 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002285 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2286 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2287 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2288 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002290 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002291#ifdef HAVE_LONG_LONG
2292 if (longlongflag) {
2293 if (width < MAX_LONG_LONG_CHARS)
2294 width = MAX_LONG_LONG_CHARS;
2295 }
2296 else
2297#endif
2298 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2299 including sign. Decimal takes the most space. This
2300 isn't enough for octal. If a width is specified we
2301 need more (which we allocate later). */
2302 if (width < MAX_LONG_CHARS)
2303 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002304
2305 /* account for the size + '\0' to separate numbers
2306 inside of the numberresults buffer */
2307 numbersize += (width + 1);
2308 }
2309 }
2310 else if ((unsigned char)*f > 127) {
2311 PyErr_Format(PyExc_ValueError,
2312 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2313 "string, got a non-ASCII byte: 0x%02x",
2314 (unsigned char)*f);
2315 return NULL;
2316 }
2317 }
2318 /* step 2: allocate memory for the results of
2319 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2320 if (callcount) {
2321 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2322 if (!callresults) {
2323 PyErr_NoMemory();
2324 return NULL;
2325 }
2326 callresult = callresults;
2327 }
2328 /* step 2.5: allocate memory for the results of formating numbers */
2329 if (numbersize) {
2330 numberresults = PyObject_Malloc(numbersize);
2331 if (!numberresults) {
2332 PyErr_NoMemory();
2333 goto fail;
2334 }
2335 numberresult = numberresults;
2336 }
2337
2338 /* step 3: format numbers and figure out how large a buffer we need */
2339 for (f = format; *f; f++) {
2340 if (*f == '%') {
2341 const char* p;
2342 int longflag;
2343 int longlongflag;
2344 int size_tflag;
2345 int numprinted;
2346
2347 p = f;
2348 zeropad = (f[1] == '0');
2349 f = parse_format_flags(f, &width, &precision,
2350 &longflag, &longlongflag, &size_tflag);
2351 switch (*f) {
2352 case 'c':
2353 {
2354 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002355 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002356 n++;
2357 break;
2358 }
2359 case '%':
2360 n++;
2361 break;
2362 case 'i':
2363 case 'd':
2364 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2365 width, precision, *f);
2366 if (longflag)
2367 numprinted = sprintf(numberresult, fmt,
2368 va_arg(count, long));
2369#ifdef HAVE_LONG_LONG
2370 else if (longlongflag)
2371 numprinted = sprintf(numberresult, fmt,
2372 va_arg(count, PY_LONG_LONG));
2373#endif
2374 else if (size_tflag)
2375 numprinted = sprintf(numberresult, fmt,
2376 va_arg(count, Py_ssize_t));
2377 else
2378 numprinted = sprintf(numberresult, fmt,
2379 va_arg(count, int));
2380 n += numprinted;
2381 /* advance by +1 to skip over the '\0' */
2382 numberresult += (numprinted + 1);
2383 assert(*(numberresult - 1) == '\0');
2384 assert(*(numberresult - 2) != '\0');
2385 assert(numprinted >= 0);
2386 assert(numberresult <= numberresults + numbersize);
2387 break;
2388 case 'u':
2389 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2390 width, precision, 'u');
2391 if (longflag)
2392 numprinted = sprintf(numberresult, fmt,
2393 va_arg(count, unsigned long));
2394#ifdef HAVE_LONG_LONG
2395 else if (longlongflag)
2396 numprinted = sprintf(numberresult, fmt,
2397 va_arg(count, unsigned PY_LONG_LONG));
2398#endif
2399 else if (size_tflag)
2400 numprinted = sprintf(numberresult, fmt,
2401 va_arg(count, size_t));
2402 else
2403 numprinted = sprintf(numberresult, fmt,
2404 va_arg(count, unsigned int));
2405 n += numprinted;
2406 numberresult += (numprinted + 1);
2407 assert(*(numberresult - 1) == '\0');
2408 assert(*(numberresult - 2) != '\0');
2409 assert(numprinted >= 0);
2410 assert(numberresult <= numberresults + numbersize);
2411 break;
2412 case 'x':
2413 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2414 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2415 n += numprinted;
2416 numberresult += (numprinted + 1);
2417 assert(*(numberresult - 1) == '\0');
2418 assert(*(numberresult - 2) != '\0');
2419 assert(numprinted >= 0);
2420 assert(numberresult <= numberresults + numbersize);
2421 break;
2422 case 'p':
2423 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2424 /* %p is ill-defined: ensure leading 0x. */
2425 if (numberresult[1] == 'X')
2426 numberresult[1] = 'x';
2427 else if (numberresult[1] != 'x') {
2428 memmove(numberresult + 2, numberresult,
2429 strlen(numberresult) + 1);
2430 numberresult[0] = '0';
2431 numberresult[1] = 'x';
2432 numprinted += 2;
2433 }
2434 n += numprinted;
2435 numberresult += (numprinted + 1);
2436 assert(*(numberresult - 1) == '\0');
2437 assert(*(numberresult - 2) != '\0');
2438 assert(numprinted >= 0);
2439 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002440 break;
2441 case 's':
2442 {
2443 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002444 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002445 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2446 if (!str)
2447 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002448 /* since PyUnicode_DecodeUTF8 returns already flexible
2449 unicode objects, there is no need to call ready on them */
2450 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002451 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002453 /* Remember the str and switch to the next slot */
2454 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002455 break;
2456 }
2457 case 'U':
2458 {
2459 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002460 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002461 if (PyUnicode_READY(obj) == -1)
2462 goto fail;
2463 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002464 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002465 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002466 break;
2467 }
2468 case 'V':
2469 {
2470 PyObject *obj = va_arg(count, PyObject *);
2471 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002472 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002473 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002474 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002475 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002476 if (PyUnicode_READY(obj) == -1)
2477 goto fail;
2478 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002479 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002480 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002481 *callresult++ = NULL;
2482 }
2483 else {
2484 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2485 if (!str_obj)
2486 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002487 if (PyUnicode_READY(str_obj)) {
2488 Py_DECREF(str_obj);
2489 goto fail;
2490 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002491 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002492 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002493 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002494 *callresult++ = str_obj;
2495 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002496 break;
2497 }
2498 case 'S':
2499 {
2500 PyObject *obj = va_arg(count, PyObject *);
2501 PyObject *str;
2502 assert(obj);
2503 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002505 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002506 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002507 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002508 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002509 /* Remember the str and switch to the next slot */
2510 *callresult++ = str;
2511 break;
2512 }
2513 case 'R':
2514 {
2515 PyObject *obj = va_arg(count, PyObject *);
2516 PyObject *repr;
2517 assert(obj);
2518 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002519 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002520 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002522 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002523 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002524 /* Remember the repr and switch to the next slot */
2525 *callresult++ = repr;
2526 break;
2527 }
2528 case 'A':
2529 {
2530 PyObject *obj = va_arg(count, PyObject *);
2531 PyObject *ascii;
2532 assert(obj);
2533 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002534 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002535 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002536 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002537 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002538 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002539 /* Remember the repr and switch to the next slot */
2540 *callresult++ = ascii;
2541 break;
2542 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002543 default:
2544 /* if we stumble upon an unknown
2545 formatting code, copy the rest of
2546 the format string to the output
2547 string. (we cannot just skip the
2548 code, since there's no way to know
2549 what's in the argument list) */
2550 n += strlen(p);
2551 goto expand;
2552 }
2553 } else
2554 n++;
2555 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002556 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002557 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002558 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002559 we don't have to resize the string.
2560 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002561 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002562 if (!string)
2563 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002564 kind = PyUnicode_KIND(string);
2565 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002566 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002569 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002570 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002571 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002572
2573 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002574 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2575 /* checking for == because the last argument could be a empty
2576 string, which causes i to point to end, the assert at the end of
2577 the loop */
2578 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002579
Benjamin Peterson14339b62009-01-31 16:36:08 +00002580 switch (*f) {
2581 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002582 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002583 const int ordinal = va_arg(vargs, int);
2584 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002585 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002586 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002587 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002588 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002590 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002591 case 'p':
2592 /* unused, since we already have the result */
2593 if (*f == 'p')
2594 (void) va_arg(vargs, void *);
2595 else
2596 (void) va_arg(vargs, int);
2597 /* extract the result from numberresults and append. */
2598 for (; *numberresult; ++i, ++numberresult)
2599 PyUnicode_WRITE(kind, data, i, *numberresult);
2600 /* skip over the separating '\0' */
2601 assert(*numberresult == '\0');
2602 numberresult++;
2603 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002604 break;
2605 case 's':
2606 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002607 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002609 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002610 size = PyUnicode_GET_LENGTH(*callresult);
2611 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002612 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002614 /* We're done with the unicode()/repr() => forget it */
2615 Py_DECREF(*callresult);
2616 /* switch to next unicode()/repr() result */
2617 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002618 break;
2619 }
2620 case 'U':
2621 {
2622 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002623 Py_ssize_t size;
2624 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2625 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002626 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002627 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002628 break;
2629 }
2630 case 'V':
2631 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002633 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002634 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002635 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002636 size = PyUnicode_GET_LENGTH(obj);
2637 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002638 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002640 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002641 size = PyUnicode_GET_LENGTH(*callresult);
2642 assert(PyUnicode_KIND(*callresult) <=
2643 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002644 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002645 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002646 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002647 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002648 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002649 break;
2650 }
2651 case 'S':
2652 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002653 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002654 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002655 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002656 /* unused, since we already have the result */
2657 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002659 copy_characters(string, i, *callresult, 0, size);
2660 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002661 /* We're done with the unicode()/repr() => forget it */
2662 Py_DECREF(*callresult);
2663 /* switch to next unicode()/repr() result */
2664 ++callresult;
2665 break;
2666 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002667 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002668 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002669 break;
2670 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002671 for (; *p; ++p, ++i)
2672 PyUnicode_WRITE(kind, data, i, *p);
2673 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002674 goto end;
2675 }
Victor Stinner1205f272010-09-11 00:54:47 +00002676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002677 else {
2678 assert(i < PyUnicode_GET_LENGTH(string));
2679 PyUnicode_WRITE(kind, data, i++, *f);
2680 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002682 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002683
Benjamin Peterson29060642009-01-31 22:14:21 +00002684 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002685 if (callresults)
2686 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002687 if (numberresults)
2688 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002689 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002690 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002691 if (callresults) {
2692 PyObject **callresult2 = callresults;
2693 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002694 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002695 ++callresult2;
2696 }
2697 PyObject_Free(callresults);
2698 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002699 if (numberresults)
2700 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002701 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002702}
2703
Walter Dörwaldd2034312007-05-18 16:29:38 +00002704PyObject *
2705PyUnicode_FromFormat(const char *format, ...)
2706{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002707 PyObject* ret;
2708 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002709
2710#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002711 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002712#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002713 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002714#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002715 ret = PyUnicode_FromFormatV(format, vargs);
2716 va_end(vargs);
2717 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002718}
2719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002720#ifdef HAVE_WCHAR_H
2721
Victor Stinner5593d8a2010-10-02 11:11:27 +00002722/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2723 convert a Unicode object to a wide character string.
2724
Victor Stinnerd88d9832011-09-06 02:00:05 +02002725 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002726 character) required to convert the unicode object. Ignore size argument.
2727
Victor Stinnerd88d9832011-09-06 02:00:05 +02002728 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002729 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002730 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002731static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002732unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002733 wchar_t *w,
2734 Py_ssize_t size)
2735{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002736 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002737 const wchar_t *wstr;
2738
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002739 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002740 if (wstr == NULL)
2741 return -1;
2742
Victor Stinner5593d8a2010-10-02 11:11:27 +00002743 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002744 if (size > res)
2745 size = res + 1;
2746 else
2747 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002748 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002749 return res;
2750 }
2751 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002752 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002753}
2754
2755Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002756PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002757 wchar_t *w,
2758 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759{
2760 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002761 PyErr_BadInternalCall();
2762 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002764 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002765}
2766
Victor Stinner137c34c2010-09-29 10:25:54 +00002767wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002768PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002769 Py_ssize_t *size)
2770{
2771 wchar_t* buffer;
2772 Py_ssize_t buflen;
2773
2774 if (unicode == NULL) {
2775 PyErr_BadInternalCall();
2776 return NULL;
2777 }
2778
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002779 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002780 if (buflen == -1)
2781 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002782 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002783 PyErr_NoMemory();
2784 return NULL;
2785 }
2786
Victor Stinner137c34c2010-09-29 10:25:54 +00002787 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2788 if (buffer == NULL) {
2789 PyErr_NoMemory();
2790 return NULL;
2791 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002792 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002793 if (buflen == -1)
2794 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002795 if (size != NULL)
2796 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002797 return buffer;
2798}
2799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002800#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801
Alexander Belopolsky40018472011-02-26 01:02:56 +00002802PyObject *
2803PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002804{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002805 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002806 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002807 PyErr_SetString(PyExc_ValueError,
2808 "chr() arg not in range(0x110000)");
2809 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002810 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002812 if (ordinal < 256)
2813 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002815 v = PyUnicode_New(1, ordinal);
2816 if (v == NULL)
2817 return NULL;
2818 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002819 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002820 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002821}
2822
Alexander Belopolsky40018472011-02-26 01:02:56 +00002823PyObject *
2824PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002825{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002826 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002827 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002828 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002829 if (PyUnicode_READY(obj))
2830 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002831 Py_INCREF(obj);
2832 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002833 }
2834 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002835 /* For a Unicode subtype that's not a Unicode object,
2836 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002837 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002838 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002839 PyErr_Format(PyExc_TypeError,
2840 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002841 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002842 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002843}
2844
Alexander Belopolsky40018472011-02-26 01:02:56 +00002845PyObject *
2846PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002847 const char *encoding,
2848 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002849{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002850 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002851 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002852
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002854 PyErr_BadInternalCall();
2855 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002857
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002858 /* Decoding bytes objects is the most common case and should be fast */
2859 if (PyBytes_Check(obj)) {
2860 if (PyBytes_GET_SIZE(obj) == 0) {
2861 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002862 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002863 }
2864 else {
2865 v = PyUnicode_Decode(
2866 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2867 encoding, errors);
2868 }
2869 return v;
2870 }
2871
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002872 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002873 PyErr_SetString(PyExc_TypeError,
2874 "decoding str is not supported");
2875 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002876 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002877
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002878 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2879 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2880 PyErr_Format(PyExc_TypeError,
2881 "coercing to str: need bytes, bytearray "
2882 "or buffer-like object, %.80s found",
2883 Py_TYPE(obj)->tp_name);
2884 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002885 }
Tim Petersced69f82003-09-16 20:30:58 +00002886
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002887 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002888 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002889 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002890 }
Tim Petersced69f82003-09-16 20:30:58 +00002891 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002892 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002893
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002894 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002895 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002896}
2897
/* Normalize an encoding name for the shortcut lookups done by the callers:
   upper case letters are converted to lower case and '_' is replaced with
   '-', so that e.g. "UTF_8" becomes "utf-8".  A NULL encoding is treated
   as "utf-8".

   The result is written to lower, a buffer of lower_len bytes, and is
   NUL-terminated on success.

   Return 1 on success, 0 on error (the normalized name, including its
   trailing NUL, does not fit into lower_len bytes). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_name[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* A zero-sized buffer cannot even hold the terminating NUL; bail out
       before computing &lower[lower_len - 1], which would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* Bound the copy by the caller's buffer, like the loop below. */
        if (lower_len < sizeof(default_name))
            return 0;
        strcpy(lower, default_name);
        return 1;
    }

    e = encoding;
    l = lower;
    /* Last writable slot: reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2934
Alexander Belopolsky40018472011-02-26 01:02:56 +00002935PyObject *
2936PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002937 Py_ssize_t size,
2938 const char *encoding,
2939 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002940{
2941 PyObject *buffer = NULL, *unicode;
2942 Py_buffer info;
2943 char lower[11]; /* Enough for any encoding shortcut */
2944
Fred Drakee4315f52000-05-09 19:53:39 +00002945 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002946 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002947 if ((strcmp(lower, "utf-8") == 0) ||
2948 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002949 return PyUnicode_DecodeUTF8(s, size, errors);
2950 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002951 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002952 (strcmp(lower, "iso-8859-1") == 0))
2953 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002954#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002955 else if (strcmp(lower, "mbcs") == 0)
2956 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002957#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002958 else if (strcmp(lower, "ascii") == 0)
2959 return PyUnicode_DecodeASCII(s, size, errors);
2960 else if (strcmp(lower, "utf-16") == 0)
2961 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2962 else if (strcmp(lower, "utf-32") == 0)
2963 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965
2966 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002967 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002968 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002969 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002970 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971 if (buffer == NULL)
2972 goto onError;
2973 unicode = PyCodec_Decode(buffer, encoding, errors);
2974 if (unicode == NULL)
2975 goto onError;
2976 if (!PyUnicode_Check(unicode)) {
2977 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002978 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002979 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980 Py_DECREF(unicode);
2981 goto onError;
2982 }
2983 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002984 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002985
Benjamin Peterson29060642009-01-31 22:14:21 +00002986 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987 Py_XDECREF(buffer);
2988 return NULL;
2989}
2990
Alexander Belopolsky40018472011-02-26 01:02:56 +00002991PyObject *
2992PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002993 const char *encoding,
2994 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002995{
2996 PyObject *v;
2997
2998 if (!PyUnicode_Check(unicode)) {
2999 PyErr_BadArgument();
3000 goto onError;
3001 }
3002
3003 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003004 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003005
3006 /* Decode via the codec registry */
3007 v = PyCodec_Decode(unicode, encoding, errors);
3008 if (v == NULL)
3009 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003010 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003011
Benjamin Peterson29060642009-01-31 22:14:21 +00003012 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003013 return NULL;
3014}
3015
Alexander Belopolsky40018472011-02-26 01:02:56 +00003016PyObject *
3017PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003018 const char *encoding,
3019 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003020{
3021 PyObject *v;
3022
3023 if (!PyUnicode_Check(unicode)) {
3024 PyErr_BadArgument();
3025 goto onError;
3026 }
3027
3028 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003029 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003030
3031 /* Decode via the codec registry */
3032 v = PyCodec_Decode(unicode, encoding, errors);
3033 if (v == NULL)
3034 goto onError;
3035 if (!PyUnicode_Check(v)) {
3036 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003037 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003038 Py_TYPE(v)->tp_name);
3039 Py_DECREF(v);
3040 goto onError;
3041 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003042 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003043
Benjamin Peterson29060642009-01-31 22:14:21 +00003044 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003045 return NULL;
3046}
3047
Alexander Belopolsky40018472011-02-26 01:02:56 +00003048PyObject *
3049PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003050 Py_ssize_t size,
3051 const char *encoding,
3052 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053{
3054 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003055
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056 unicode = PyUnicode_FromUnicode(s, size);
3057 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003058 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3060 Py_DECREF(unicode);
3061 return v;
3062}
3063
Alexander Belopolsky40018472011-02-26 01:02:56 +00003064PyObject *
3065PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003066 const char *encoding,
3067 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003068{
3069 PyObject *v;
3070
3071 if (!PyUnicode_Check(unicode)) {
3072 PyErr_BadArgument();
3073 goto onError;
3074 }
3075
3076 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003078
3079 /* Encode via the codec registry */
3080 v = PyCodec_Encode(unicode, encoding, errors);
3081 if (v == NULL)
3082 goto onError;
3083 return v;
3084
Benjamin Peterson29060642009-01-31 22:14:21 +00003085 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003086 return NULL;
3087}
3088
Victor Stinnerad158722010-10-27 00:25:46 +00003089PyObject *
3090PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003091{
Victor Stinner99b95382011-07-04 14:23:54 +02003092#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003093 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003094#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003095 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003096#else
Victor Stinner793b5312011-04-27 00:24:21 +02003097 PyInterpreterState *interp = PyThreadState_GET()->interp;
3098 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3099 cannot use it to encode and decode filenames before it is loaded. Load
3100 the Python codec requires to encode at least its own filename. Use the C
3101 version of the locale codec until the codec registry is initialized and
3102 the Python codec is loaded.
3103
3104 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3105 cannot only rely on it: check also interp->fscodec_initialized for
3106 subinterpreters. */
3107 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003108 return PyUnicode_AsEncodedString(unicode,
3109 Py_FileSystemDefaultEncoding,
3110 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003111 }
3112 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003113 /* locale encoding with surrogateescape */
3114 wchar_t *wchar;
3115 char *bytes;
3116 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003117 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003118
3119 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3120 if (wchar == NULL)
3121 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003122 bytes = _Py_wchar2char(wchar, &error_pos);
3123 if (bytes == NULL) {
3124 if (error_pos != (size_t)-1) {
3125 char *errmsg = strerror(errno);
3126 PyObject *exc = NULL;
3127 if (errmsg == NULL)
3128 errmsg = "Py_wchar2char() failed";
3129 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003130 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003131 error_pos, error_pos+1,
3132 errmsg);
3133 Py_XDECREF(exc);
3134 }
3135 else
3136 PyErr_NoMemory();
3137 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003138 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003139 }
3140 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003141
3142 bytes_obj = PyBytes_FromString(bytes);
3143 PyMem_Free(bytes);
3144 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003145 }
Victor Stinnerad158722010-10-27 00:25:46 +00003146#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003147}
3148
Alexander Belopolsky40018472011-02-26 01:02:56 +00003149PyObject *
3150PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003151 const char *encoding,
3152 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153{
3154 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003155 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003156
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157 if (!PyUnicode_Check(unicode)) {
3158 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003159 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 }
Fred Drakee4315f52000-05-09 19:53:39 +00003161
Fred Drakee4315f52000-05-09 19:53:39 +00003162 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003163 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003164 if ((strcmp(lower, "utf-8") == 0) ||
3165 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003166 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003167 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003168 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003169 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003170 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003171 }
Victor Stinner37296e82010-06-10 13:36:23 +00003172 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003173 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003174 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003175 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003176#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003177 else if (strcmp(lower, "mbcs") == 0)
3178 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003179#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003180 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003181 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183
3184 /* Encode via the codec registry */
3185 v = PyCodec_Encode(unicode, encoding, errors);
3186 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003187 return NULL;
3188
3189 /* The normal path */
3190 if (PyBytes_Check(v))
3191 return v;
3192
3193 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003194 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003195 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003196 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003197
3198 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3199 "encoder %s returned bytearray instead of bytes",
3200 encoding);
3201 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003202 Py_DECREF(v);
3203 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003204 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003205
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003206 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3207 Py_DECREF(v);
3208 return b;
3209 }
3210
3211 PyErr_Format(PyExc_TypeError,
3212 "encoder did not return a bytes object (type=%.400s)",
3213 Py_TYPE(v)->tp_name);
3214 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003215 return NULL;
3216}
3217
Alexander Belopolsky40018472011-02-26 01:02:56 +00003218PyObject *
3219PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003220 const char *encoding,
3221 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003222{
3223 PyObject *v;
3224
3225 if (!PyUnicode_Check(unicode)) {
3226 PyErr_BadArgument();
3227 goto onError;
3228 }
3229
3230 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003231 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003232
3233 /* Encode via the codec registry */
3234 v = PyCodec_Encode(unicode, encoding, errors);
3235 if (v == NULL)
3236 goto onError;
3237 if (!PyUnicode_Check(v)) {
3238 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003239 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003240 Py_TYPE(v)->tp_name);
3241 Py_DECREF(v);
3242 goto onError;
3243 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003245
Benjamin Peterson29060642009-01-31 22:14:21 +00003246 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 return NULL;
3248}
3249
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003250PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003251PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003252 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003253 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3254}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003255
Christian Heimes5894ba72007-11-04 11:43:14 +00003256PyObject*
3257PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3258{
Victor Stinner99b95382011-07-04 14:23:54 +02003259#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003260 return PyUnicode_DecodeMBCS(s, size, NULL);
3261#elif defined(__APPLE__)
3262 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3263#else
Victor Stinner793b5312011-04-27 00:24:21 +02003264 PyInterpreterState *interp = PyThreadState_GET()->interp;
3265 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3266 cannot use it to encode and decode filenames before it is loaded. Load
3267 the Python codec requires to encode at least its own filename. Use the C
3268 version of the locale codec until the codec registry is initialized and
3269 the Python codec is loaded.
3270
3271 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3272 cannot only rely on it: check also interp->fscodec_initialized for
3273 subinterpreters. */
3274 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003275 return PyUnicode_Decode(s, size,
3276 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003277 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003278 }
3279 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003280 /* locale encoding with surrogateescape */
3281 wchar_t *wchar;
3282 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003283 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003284
3285 if (s[size] != '\0' || size != strlen(s)) {
3286 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3287 return NULL;
3288 }
3289
Victor Stinner168e1172010-10-16 23:16:16 +00003290 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003291 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003292 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003293
Victor Stinner168e1172010-10-16 23:16:16 +00003294 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003295 PyMem_Free(wchar);
3296 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003297 }
Victor Stinnerad158722010-10-27 00:25:46 +00003298#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003299}
3300
Martin v. Löwis011e8422009-05-05 04:43:17 +00003301
3302int
3303PyUnicode_FSConverter(PyObject* arg, void* addr)
3304{
3305 PyObject *output = NULL;
3306 Py_ssize_t size;
3307 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003308 if (arg == NULL) {
3309 Py_DECREF(*(PyObject**)addr);
3310 return 1;
3311 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003312 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003313 output = arg;
3314 Py_INCREF(output);
3315 }
3316 else {
3317 arg = PyUnicode_FromObject(arg);
3318 if (!arg)
3319 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003320 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003321 Py_DECREF(arg);
3322 if (!output)
3323 return 0;
3324 if (!PyBytes_Check(output)) {
3325 Py_DECREF(output);
3326 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3327 return 0;
3328 }
3329 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003330 size = PyBytes_GET_SIZE(output);
3331 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003332 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003333 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003334 Py_DECREF(output);
3335 return 0;
3336 }
3337 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003338 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003339}
3340
3341
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003342int
3343PyUnicode_FSDecoder(PyObject* arg, void* addr)
3344{
3345 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003346 if (arg == NULL) {
3347 Py_DECREF(*(PyObject**)addr);
3348 return 1;
3349 }
3350 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003351 if (PyUnicode_READY(arg))
3352 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003353 output = arg;
3354 Py_INCREF(output);
3355 }
3356 else {
3357 arg = PyBytes_FromObject(arg);
3358 if (!arg)
3359 return 0;
3360 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3361 PyBytes_GET_SIZE(arg));
3362 Py_DECREF(arg);
3363 if (!output)
3364 return 0;
3365 if (!PyUnicode_Check(output)) {
3366 Py_DECREF(output);
3367 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3368 return 0;
3369 }
3370 }
Victor Stinner065836e2011-10-27 01:56:33 +02003371 if (PyUnicode_READY(output) < 0) {
3372 Py_DECREF(output);
3373 return 0;
3374 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003375 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003376 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003377 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3378 Py_DECREF(output);
3379 return 0;
3380 }
3381 *(PyObject**)addr = output;
3382 return Py_CLEANUP_SUPPORTED;
3383}
3384
3385
Martin v. Löwis5b222132007-06-10 09:51:05 +00003386char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003387PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003388{
Christian Heimesf3863112007-11-22 07:46:41 +00003389 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003390
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003391 if (!PyUnicode_Check(unicode)) {
3392 PyErr_BadArgument();
3393 return NULL;
3394 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003395 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003396 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003397
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003398 if (PyUnicode_UTF8(unicode) == NULL) {
3399 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003400 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3401 if (bytes == NULL)
3402 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003403 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3404 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003405 Py_DECREF(bytes);
3406 return NULL;
3407 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003408 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3409 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3410 PyBytes_AS_STRING(bytes),
3411 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003412 Py_DECREF(bytes);
3413 }
3414
3415 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003416 *psize = PyUnicode_UTF8_LENGTH(unicode);
3417 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003418}
3419
3420char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003421PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003422{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003423 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3424}
3425
3426#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003427static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003428#endif
3429
3430
3431Py_UNICODE *
3432PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3433{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003434 const unsigned char *one_byte;
3435#if SIZEOF_WCHAR_T == 4
3436 const Py_UCS2 *two_bytes;
3437#else
3438 const Py_UCS4 *four_bytes;
3439 const Py_UCS4 *ucs4_end;
3440 Py_ssize_t num_surrogates;
3441#endif
3442 wchar_t *w;
3443 wchar_t *wchar_end;
3444
3445 if (!PyUnicode_Check(unicode)) {
3446 PyErr_BadArgument();
3447 return NULL;
3448 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003449 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003450 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003451 assert(_PyUnicode_KIND(unicode) != 0);
3452 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003453
3454#ifdef Py_DEBUG
3455 ++unicode_as_unicode_calls;
3456#endif
3457
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003458 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003459#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003460 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3461 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003462 num_surrogates = 0;
3463
3464 for (; four_bytes < ucs4_end; ++four_bytes) {
3465 if (*four_bytes > 0xFFFF)
3466 ++num_surrogates;
3467 }
3468
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003469 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3470 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3471 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003472 PyErr_NoMemory();
3473 return NULL;
3474 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003475 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003476
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003477 w = _PyUnicode_WSTR(unicode);
3478 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3479 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003480 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3481 if (*four_bytes > 0xFFFF) {
3482 /* encode surrogate pair in this case */
3483 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3484 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3485 }
3486 else
3487 *w = *four_bytes;
3488
3489 if (w > wchar_end) {
3490 assert(0 && "Miscalculated string end");
3491 }
3492 }
3493 *w = 0;
3494#else
3495 /* sizeof(wchar_t) == 4 */
3496 Py_FatalError("Impossible unicode object state, wstr and str "
3497 "should share memory already.");
3498 return NULL;
3499#endif
3500 }
3501 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003502 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3503 (_PyUnicode_LENGTH(unicode) + 1));
3504 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003505 PyErr_NoMemory();
3506 return NULL;
3507 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003508 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3509 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3510 w = _PyUnicode_WSTR(unicode);
3511 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003512
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003513 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3514 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003515 for (; w < wchar_end; ++one_byte, ++w)
3516 *w = *one_byte;
3517 /* null-terminate the wstr */
3518 *w = 0;
3519 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003520 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003521#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003522 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003523 for (; w < wchar_end; ++two_bytes, ++w)
3524 *w = *two_bytes;
3525 /* null-terminate the wstr */
3526 *w = 0;
3527#else
3528 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003529 PyObject_FREE(_PyUnicode_WSTR(unicode));
3530 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003531 Py_FatalError("Impossible unicode object state, wstr "
3532 "and str should share memory already.");
3533 return NULL;
3534#endif
3535 }
3536 else {
3537 assert(0 && "This should never happen.");
3538 }
3539 }
3540 }
3541 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003542 *size = PyUnicode_WSTR_LENGTH(unicode);
3543 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003544}
3545
Alexander Belopolsky40018472011-02-26 01:02:56 +00003546Py_UNICODE *
3547PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003549 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003550}
3551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003552
Alexander Belopolsky40018472011-02-26 01:02:56 +00003553Py_ssize_t
3554PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555{
3556 if (!PyUnicode_Check(unicode)) {
3557 PyErr_BadArgument();
3558 goto onError;
3559 }
3560 return PyUnicode_GET_SIZE(unicode);
3561
Benjamin Peterson29060642009-01-31 22:14:21 +00003562 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 return -1;
3564}
3565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003566Py_ssize_t
3567PyUnicode_GetLength(PyObject *unicode)
3568{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003569 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003570 PyErr_BadArgument();
3571 return -1;
3572 }
3573
3574 return PyUnicode_GET_LENGTH(unicode);
3575}
3576
3577Py_UCS4
3578PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3579{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003580 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3581 PyErr_BadArgument();
3582 return (Py_UCS4)-1;
3583 }
3584 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3585 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003586 return (Py_UCS4)-1;
3587 }
3588 return PyUnicode_READ_CHAR(unicode, index);
3589}
3590
3591int
3592PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3593{
3594 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003595 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003596 return -1;
3597 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003598 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3599 PyErr_SetString(PyExc_IndexError, "string index out of range");
3600 return -1;
3601 }
3602 if (_PyUnicode_Dirty(unicode))
3603 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003604 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3605 index, ch);
3606 return 0;
3607}
3608
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3614
Victor Stinner554f3f02010-06-16 23:33:54 +00003615/* create or adjust a UnicodeDecodeError */
3616static void
3617make_decode_exception(PyObject **exceptionObject,
3618 const char *encoding,
3619 const char *input, Py_ssize_t length,
3620 Py_ssize_t startpos, Py_ssize_t endpos,
3621 const char *reason)
3622{
3623 if (*exceptionObject == NULL) {
3624 *exceptionObject = PyUnicodeDecodeError_Create(
3625 encoding, input, length, startpos, endpos, reason);
3626 }
3627 else {
3628 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3629 goto onError;
3630 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3631 goto onError;
3632 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3633 goto onError;
3634 }
3635 return;
3636
3637onError:
3638 Py_DECREF(*exceptionObject);
3639 *exceptionObject = NULL;
3640}
3641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003642/* error handling callback helper:
3643 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003644 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003645 and adjust various state variables.
3646 return 0 on success, -1 on error
3647*/
3648
Alexander Belopolsky40018472011-02-26 01:02:56 +00003649static int
3650unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003651 const char *encoding, const char *reason,
3652 const char **input, const char **inend, Py_ssize_t *startinpos,
3653 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003654 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003656 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657
3658 PyObject *restuple = NULL;
3659 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003660 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003661 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003662 Py_ssize_t requiredsize;
3663 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003664 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665 int res = -1;
3666
Victor Stinner596a6c42011-11-09 00:02:18 +01003667 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3668 outsize = PyUnicode_GET_LENGTH(*output);
3669 else
3670 outsize = _PyUnicode_WSTR_LENGTH(*output);
3671
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003672 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003673 *errorHandler = PyCodec_LookupError(errors);
3674 if (*errorHandler == NULL)
3675 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 }
3677
Victor Stinner554f3f02010-06-16 23:33:54 +00003678 make_decode_exception(exceptionObject,
3679 encoding,
3680 *input, *inend - *input,
3681 *startinpos, *endinpos,
3682 reason);
3683 if (*exceptionObject == NULL)
3684 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685
3686 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3687 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003688 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003689 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003690 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003691 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 }
3693 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003694 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003695 if (PyUnicode_READY(repunicode) < 0)
3696 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003697
3698 /* Copy back the bytes variables, which might have been modified by the
3699 callback */
3700 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3701 if (!inputobj)
3702 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003703 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003704 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003705 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003706 *input = PyBytes_AS_STRING(inputobj);
3707 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003708 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003709 /* we can DECREF safely, as the exception has another reference,
3710 so the object won't go away. */
3711 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003713 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003714 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003715 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003716 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3717 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003718 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003719
Victor Stinner596a6c42011-11-09 00:02:18 +01003720 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3721 /* need more space? (at least enough for what we
3722 have+the replacement+the rest of the string (starting
3723 at the new input position), so we won't have to check space
3724 when there are no errors in the rest of the string) */
3725 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3726 requiredsize = *outpos + replen + insize-newpos;
3727 if (requiredsize > outsize) {
3728 if (requiredsize<2*outsize)
3729 requiredsize = 2*outsize;
3730 if (unicode_resize(output, requiredsize) < 0)
3731 goto onError;
3732 }
3733 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003734 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003735 copy_characters(*output, *outpos, repunicode, 0, replen);
3736 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003737 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003738 else {
3739 wchar_t *repwstr;
3740 Py_ssize_t repwlen;
3741 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3742 if (repwstr == NULL)
3743 goto onError;
3744 /* need more space? (at least enough for what we
3745 have+the replacement+the rest of the string (starting
3746 at the new input position), so we won't have to check space
3747 when there are no errors in the rest of the string) */
3748 requiredsize = *outpos + repwlen + insize-newpos;
3749 if (requiredsize > outsize) {
3750 if (requiredsize < 2*outsize)
3751 requiredsize = 2*outsize;
3752 if (unicode_resize(output, requiredsize) < 0)
3753 goto onError;
3754 }
3755 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3756 *outpos += repwlen;
3757 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003758 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003759 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003760
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003761 /* we made it! */
3762 res = 0;
3763
Benjamin Peterson29060642009-01-31 22:14:21 +00003764 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003765 Py_XDECREF(restuple);
3766 return res;
3767}
3768
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Base-64 helper macros.  Each one evaluates its argument more than once,
   so pass only side-effect-free expressions. */

/* IS_BASE64: true if c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/'). */
#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value of c.  Only meaningful when IS_BASE64(c)
   holds; any character outside the tested ranges other than '+' yields 63. */
#define FROM_BASE64(c)                                      \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :               \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the low 6 bits of n. */
#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  Decoding is permissive: every ASCII byte maps to
 * itself except '+', which opens a base-64 section. */
#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
3803
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  utf7_category[] maps each ASCII code to its
 * class:
 *   0 : "Set D"
 *       alphanumeric and '(),-./:?
 *   1 : "Set O"
 *       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *       ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static char utf7_category[128] = {
    /* 0x00: nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
    /* 0x10: dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
    /* 0x20: sp  !   "   #   $   %   &   '   (   )   *   +   ,   -   .   / */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
    /* 0x30: 0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ? */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
    /* 0x40: @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x50: P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _ */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
    /* 0x60: `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x70: p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so that utf7_category[] is only indexed
 * with values 1..127; NUL and non-ASCII characters are never direct. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003849
Alexander Belopolsky40018472011-02-26 01:02:56 +00003850PyObject *
3851PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003852 Py_ssize_t size,
3853 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003854{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003855 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3856}
3857
Antoine Pitrou244651a2009-05-04 18:56:13 +00003858/* The decoder. The only state we preserve is our read position,
3859 * i.e. how many characters we have consumed. So if we end in the
3860 * middle of a shift sequence we have to back off the read position
3861 * and the output to the beginning of the sequence, otherwise we lose
3862 * all the shift state (seen bits, number of bits seen, high
3863 * surrogate). */
3864
Alexander Belopolsky40018472011-02-26 01:02:56 +00003865PyObject *
3866PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003867 Py_ssize_t size,
3868 const char *errors,
3869 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003870{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003871 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003872 Py_ssize_t startinpos;
3873 Py_ssize_t endinpos;
3874 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003875 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003876 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003877 const char *errmsg = "";
3878 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003879 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003880 unsigned int base64bits = 0;
3881 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003882 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003883 PyObject *errorHandler = NULL;
3884 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003885
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003886 /* Start off assuming it's all ASCII. Widen later as necessary. */
3887 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003888 if (!unicode)
3889 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003890 if (size == 0) {
3891 if (consumed)
3892 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003893 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003894 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003895
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003896 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003897 e = s + size;
3898
3899 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003900 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003901 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003902 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003903
Antoine Pitrou244651a2009-05-04 18:56:13 +00003904 if (inShift) { /* in a base-64 section */
3905 if (IS_BASE64(ch)) { /* consume a base-64 character */
3906 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3907 base64bits += 6;
3908 s++;
3909 if (base64bits >= 16) {
3910 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003911 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003912 base64bits -= 16;
3913 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3914 if (surrogate) {
3915 /* expecting a second surrogate */
3916 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003917 Py_UCS4 ch2 = (((surrogate & 0x3FF)<<10)
3918 | (outCh & 0x3FF)) + 0x10000;
3919 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3920 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003921 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003922 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003923 }
3924 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003925 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3926 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003927 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003928 }
3929 }
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003930 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003931 /* first surrogate */
3932 surrogate = outCh;
3933 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003934 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003935 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3936 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003937 }
3938 }
3939 }
3940 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003941 inShift = 0;
3942 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003943 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003944 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3945 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003946 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003947 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003948 if (base64bits > 0) { /* left-over bits */
3949 if (base64bits >= 6) {
3950 /* We've seen at least one base-64 character */
3951 errmsg = "partial character in shift sequence";
3952 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003953 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003954 else {
3955 /* Some bits remain; they should be zero */
3956 if (base64buffer != 0) {
3957 errmsg = "non-zero padding bits in shift sequence";
3958 goto utf7Error;
3959 }
3960 }
3961 }
3962 if (ch != '-') {
3963 /* '-' is absorbed; other terminating
3964 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003965 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3966 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003967 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003968 }
3969 }
3970 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003971 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003972 s++; /* consume '+' */
3973 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003974 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003975 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3976 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003977 }
3978 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003979 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003980 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003981 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003982 }
3983 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003984 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003985 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3986 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003987 s++;
3988 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003989 else {
3990 startinpos = s-starts;
3991 s++;
3992 errmsg = "unexpected special character";
3993 goto utf7Error;
3994 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003995 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003996utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003997 endinpos = s-starts;
3998 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003999 errors, &errorHandler,
4000 "utf7", errmsg,
4001 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004002 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004003 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004004 }
4005
Antoine Pitrou244651a2009-05-04 18:56:13 +00004006 /* end of string */
4007
4008 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4009 /* if we're in an inconsistent state, that's an error */
4010 if (surrogate ||
4011 (base64bits >= 6) ||
4012 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004013 endinpos = size;
4014 if (unicode_decode_call_errorhandler(
4015 errors, &errorHandler,
4016 "utf7", "unterminated shift sequence",
4017 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004018 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004019 goto onError;
4020 if (s < e)
4021 goto restart;
4022 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004023 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004024
4025 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004026 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004027 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004028 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004029 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004030 }
4031 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004032 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004033 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004034 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004035
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004036 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004037 goto onError;
4038
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004039 Py_XDECREF(errorHandler);
4040 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004041 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004042
Benjamin Peterson29060642009-01-31 22:14:21 +00004043 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004044 Py_XDECREF(errorHandler);
4045 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004046 Py_DECREF(unicode);
4047 return NULL;
4048}
4049
4050
Alexander Belopolsky40018472011-02-26 01:02:56 +00004051PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004052_PyUnicode_EncodeUTF7(PyObject *str,
4053 int base64SetO,
4054 int base64WhiteSpace,
4055 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004056{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004057 int kind;
4058 void *data;
4059 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004060 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004061 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004062 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004063 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004064 unsigned int base64bits = 0;
4065 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004066 char * out;
4067 char * start;
4068
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004069 if (PyUnicode_READY(str) < 0)
4070 return NULL;
4071 kind = PyUnicode_KIND(str);
4072 data = PyUnicode_DATA(str);
4073 len = PyUnicode_GET_LENGTH(str);
4074
4075 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004076 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004077
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004078 /* It might be possible to tighten this worst case */
4079 allocated = 8 * len;
4080 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004081 return PyErr_NoMemory();
4082
Antoine Pitrou244651a2009-05-04 18:56:13 +00004083 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004084 if (v == NULL)
4085 return NULL;
4086
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004087 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004088 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004089 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004090
Antoine Pitrou244651a2009-05-04 18:56:13 +00004091 if (inShift) {
4092 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4093 /* shifting out */
4094 if (base64bits) { /* output remaining bits */
4095 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4096 base64buffer = 0;
4097 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004098 }
4099 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004100 /* Characters not in the BASE64 set implicitly unshift the sequence
4101 so no '-' is required, except if the character is itself a '-' */
4102 if (IS_BASE64(ch) || ch == '-') {
4103 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004104 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004105 *out++ = (char) ch;
4106 }
4107 else {
4108 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004109 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004110 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004111 else { /* not in a shift sequence */
4112 if (ch == '+') {
4113 *out++ = '+';
4114 *out++ = '-';
4115 }
4116 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4117 *out++ = (char) ch;
4118 }
4119 else {
4120 *out++ = '+';
4121 inShift = 1;
4122 goto encode_char;
4123 }
4124 }
4125 continue;
4126encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004127 if (ch >= 0x10000) {
4128 /* code first surrogate */
4129 base64bits += 16;
4130 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4131 while (base64bits >= 6) {
4132 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4133 base64bits -= 6;
4134 }
4135 /* prepare second surrogate */
4136 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4137 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004138 base64bits += 16;
4139 base64buffer = (base64buffer << 16) | ch;
4140 while (base64bits >= 6) {
4141 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4142 base64bits -= 6;
4143 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004144 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004145 if (base64bits)
4146 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4147 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004148 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004149 if (_PyBytes_Resize(&v, out - start) < 0)
4150 return NULL;
4151 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004152}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004153PyObject *
4154PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4155 Py_ssize_t size,
4156 int base64SetO,
4157 int base64WhiteSpace,
4158 const char *errors)
4159{
4160 PyObject *result;
4161 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4162 if (tmp == NULL)
4163 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004164 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004165 base64WhiteSpace, errors);
4166 Py_DECREF(tmp);
4167 return result;
4168}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004169
Antoine Pitrou244651a2009-05-04 18:56:13 +00004170#undef IS_BASE64
4171#undef FROM_BASE64
4172#undef TO_BASE64
4173#undef DECODE_DIRECT
4174#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004175
/* --- UTF-8 Codec -------------------------------------------------------- */

/* Map a UTF-8 encoded prefix (lead) byte to the length of its sequence.
   Zero means illegal prefix.  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-FF: multi-byte prefixes */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4199
Alexander Belopolsky40018472011-02-26 01:02:56 +00004200PyObject *
4201PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004202 Py_ssize_t size,
4203 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004204{
Walter Dörwald69652032004-09-07 20:24:22 +00004205 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4206}
4207
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004208#include "stringlib/ucs1lib.h"
4209#include "stringlib/codecs.h"
4210#include "stringlib/undef.h"
4211
4212#include "stringlib/ucs2lib.h"
4213#include "stringlib/codecs.h"
4214#include "stringlib/undef.h"
4215
4216#include "stringlib/ucs4lib.h"
4217#include "stringlib/codecs.h"
4218#include "stringlib/undef.h"
4219
Antoine Pitrouab868312009-01-10 15:40:25 +00004220/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4221#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4222
4223/* Mask to quickly check whether a C 'long' contains a
4224 non-ASCII, UTF8-encoded char. */
4225#if (SIZEOF_LONG == 8)
4226# define ASCII_CHAR_MASK 0x8080808080808080L
4227#elif (SIZEOF_LONG == 4)
4228# define ASCII_CHAR_MASK 0x80808080L
4229#else
4230# error C 'long' size should be either 4 or 8!
4231#endif
4232
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004233/* Scans a UTF-8 string and returns the maximum character to be expected
4234 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004235
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004236 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004237 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238 */
4239static Py_UCS4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004240utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
4241 Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004242{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004243 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004244 const unsigned char *p = (const unsigned char *)s;
4245 const unsigned char *end = p + string_size;
4246 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004247
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004248 assert(unicode_size != NULL);
4249
4250 /* By having a cascade of independent loops which fallback onto each
4251 other, we minimize the amount of work done in the average loop
4252 iteration, and we also maximize the CPU's ability to predict
4253 branches correctly (because a given condition will have always the
4254 same boolean outcome except perhaps in the last iteration of the
4255 corresponding loop).
4256 In the general case this brings us rather close to decoding
4257 performance pre-PEP 393, despite the two-pass decoding.
4258
4259 Note that the pure ASCII loop is not duplicated once a non-ASCII
4260 character has been encountered. It is actually a pessimization (by
4261 a significant factor) to use this loop on text with many non-ASCII
4262 characters, and it is important to avoid bad performance on valid
4263 utf-8 data (invalid utf-8 being a different can of worms).
4264 */
4265
4266 /* ASCII */
4267 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004268 /* Only check value if it's not a ASCII char... */
4269 if (*p < 0x80) {
4270 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4271 an explanation. */
4272 if (!((size_t) p & LONG_PTR_MASK)) {
4273 /* Help register allocation */
4274 register const unsigned char *_p = p;
4275 while (_p < aligned_end) {
4276 unsigned long value = *(unsigned long *) _p;
4277 if (value & ASCII_CHAR_MASK)
4278 break;
4279 _p += SIZEOF_LONG;
4280 char_count += SIZEOF_LONG;
4281 }
4282 p = _p;
4283 if (p == end)
4284 break;
4285 }
4286 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004287 if (*p < 0x80)
4288 ++char_count;
4289 else
4290 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004291 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004292 *unicode_size = char_count;
4293 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004294
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004295_ucs1loop:
4296 for (; p < end; ++p) {
4297 if (*p < 0xc4)
4298 char_count += ((*p & 0xc0) != 0x80);
4299 else
4300 goto _ucs2loop;
4301 }
4302 *unicode_size = char_count;
4303 return 255;
4304
4305_ucs2loop:
4306 for (; p < end; ++p) {
4307 if (*p < 0xf0)
4308 char_count += ((*p & 0xc0) != 0x80);
4309 else
4310 goto _ucs4loop;
4311 }
4312 *unicode_size = char_count;
4313 return 65535;
4314
4315_ucs4loop:
4316 for (; p < end; ++p) {
4317 char_count += ((*p & 0xc0) != 0x80);
4318 }
4319 *unicode_size = char_count;
4320 return 65537;
4321}
4322
4323/* Called when we encountered some error that wasn't detected in the original
4324 scan, e.g. an encoded surrogate character. The original maxchar computation
4325 may have been incorrect, so redo it. */
4326static int
4327refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
4328{
4329 PyObject *tmp;
4330 Py_ssize_t k, maxchar;
4331 for (k = 0, maxchar = 0; k < n; k++)
4332 maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
4333 tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
4334 if (tmp == NULL)
4335 return -1;
4336 PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
4337 Py_DECREF(*unicode);
4338 *unicode = tmp;
4339 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004340}
4341
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors. Implicit parameters: unicode, kind, data, has_errors,
   onError. Potential resizing overallocates, so the result needs to shrink
   at the end.

   When has_errors is zero this is a plain PyUnicode_WRITE(kind, data, ...)
   into the pre-sized buffer.  When has_errors is non-zero the write goes
   through unicode_putchar() on the `unicode` object itself (the cached
   kind/data are not used), after growing the string to pos + pos/8 if pos
   is beyond the current length.  Any failure jumps to the caller's onError
   label.

   `index` is expanded exactly once on either path, so an argument with a
   side effect such as i++ is evaluated once.

   NOTE(review): the resize guard is `pos > length`, so pos == length does
   not trigger a resize here; presumably callers guarantee that there is
   always room for the next character -- confirm.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        if (has_errors) {                                               \
            Py_ssize_t pos = index;                                     \
            if (pos > PyUnicode_GET_LENGTH(unicode) &&                  \
                unicode_resize(&unicode, pos + pos/8) < 0)              \
                goto onError;                                           \
            if (unicode_putchar(&unicode, &pos, value) < 0)             \
                goto onError;                                           \
        }                                                               \
        else                                                            \
            PyUnicode_WRITE(kind, data, index, value);                  \
    } while (0)
4360
/* Decode `size` bytes of UTF-8 starting at `s` into a new unicode object.

   s        -- input bytes
   size     -- number of input bytes; 0 yields an empty string
   errors   -- error handler name, forwarded unchanged to
               unicode_decode_call_errorhandler()
   consumed -- if non-NULL, a truncated multi-byte sequence at the end of
               the input is not an error: decoding stops in front of it and
               *consumed receives the number of input bytes decoded.  If
               NULL, such a sequence is reported as "unexpected end of data".

   Returns a new reference, or NULL on failure.

   Strategy: a first pass (utf8_max_char_size_and_char_count) sizes the
   result; pure ASCII input is then copied with memcpy.  Otherwise a
   kind-specific fast decoder is tried, and only if it reports a problem does
   the generic byte-by-byte loop below run on the remaining input. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    int n;
    int k;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e, *aligned_end;
    PyObject *unicode;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UCS4 maxchar = 0;
    Py_ssize_t unicode_size;
    Py_ssize_t i;
    int kind;
    void *data;
    int has_errors = 0;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)PyUnicode_New(0, 0);
    }
    maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
    /* When the string is ASCII only, just use memcpy and return.
       unicode_size may be != size if there is an incomplete UTF-8
       sequence at the end of the ASCII block. */
    if (maxchar < 128 && size == unicode_size) {
        if (consumed)
            *consumed = size;

        if (size == 1)
            return get_latin1_char((unsigned char)s[0]);

        unicode = PyUnicode_New(unicode_size, maxchar);
        if (!unicode)
            return NULL;
        Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
        assert(_PyUnicode_CheckConsistency(unicode, 1));
        return unicode;
    }

    /* In case of errors, maxchar and size computation might be incorrect;
       code below refits and resizes as necessary. */
    unicode = PyUnicode_New(unicode_size, maxchar);
    if (!unicode)
        return NULL;
    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);

    /* Unpack UTF-8 encoded data */
    i = 0;
    e = s + size;
    /* Try the kind-specific decoder first.  It advances s and i through its
       last two arguments and its result is stored as the has_errors flag. */
    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
        break;
    case PyUnicode_2BYTE_KIND:
        has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
        break;
    case PyUnicode_4BYTE_KIND:
        has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
        break;
    }
    if (!has_errors) {
        /* Ensure the unicode size calculation was correct */
        assert(i == unicode_size);
        assert(s == e);
        if (consumed)
            *consumed = s-starts;
        return unicode;
    }
    /* Fall through to the generic decoding loop for the rest of
       the string */
    /* From here on has_errors is non-zero, so every WRITE_MAYBE_FAIL below
       takes its unicode_putchar() path rather than writing through the
       cached kind/data. */
    if (refit_partial_string(&unicode, kind, data, i) < 0)
        goto onError;

    /* Last long-aligned address not beyond e; bounds the word-at-a-time
       ASCII loop. */
    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            /* Fast path for runs of ASCII characters. Given that common UTF-8
               input will consist of an overwhelming majority of ASCII
               characters, we try to optimize for this case by checking
               as many characters as a C 'long' can contain.
               First, check if we can do an aligned read, as most CPUs have
               a penalty for unaligned reads.
            */
            if (!((size_t) s & LONG_PTR_MASK)) {
                /* Help register allocation */
                register const char *_s = s;
                register Py_ssize_t _i = i;
                while (_s < aligned_end) {
                    /* Read a whole long at a time (either 4 or 8 bytes),
                       and do a fast unrolled copy if it only contains ASCII
                       characters. */
                    unsigned long value = *(unsigned long *) _s;
                    if (value & ASCII_CHAR_MASK)
                        break;
                    WRITE_MAYBE_FAIL(_i+0, _s[0]);
                    WRITE_MAYBE_FAIL(_i+1, _s[1]);
                    WRITE_MAYBE_FAIL(_i+2, _s[2]);
                    WRITE_MAYBE_FAIL(_i+3, _s[3]);
#if (SIZEOF_LONG == 8)
                    WRITE_MAYBE_FAIL(_i+4, _s[4]);
                    WRITE_MAYBE_FAIL(_i+5, _s[5]);
                    WRITE_MAYBE_FAIL(_i+6, _s[6]);
                    WRITE_MAYBE_FAIL(_i+7, _s[7]);
#endif
                    _s += SIZEOF_LONG;
                    _i += SIZEOF_LONG;
                }
                s = _s;
                i = _i;
                if (s == e)
                    break;
                /* Re-read the current byte: the word loop may have moved s. */
                ch = (unsigned char)*s;
            }
        }

        if (ch < 0x80) {
            WRITE_MAYBE_FAIL(i++, ch);
            s++;
            continue;
        }

        /* Expected length of the sequence, looked up from its lead byte. */
        n = utf8_code_length[ch];

        if (s + n > e) {
            /* The sequence would run past the end of the input. */
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* Extend the reported range over the continuation bytes
                   that are actually present. */
                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
                    endinpos++;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            errmsg = "invalid start byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* Bytes below 0x80 were handled above, so a length of 1 here is
               reported as an internal error. */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            WRITE_MAYBE_FAIL(i++, ch);
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            /* The 0xE0 test additionally rejects second bytes below 0xA0
               after a 0xE0 lead byte. */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;

                /* if s[1] first two bits are 1 and 0, then the invalid
                   continuation byte is s[2], so increment endinpos by 1,
                   if not, s[1] is invalid and endinpos doesn't need to
                   be incremented. */
                if ((s[1] & 0xC0) == 0x80)
                    endinpos++;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            WRITE_MAYBE_FAIL(i++, ch);
            break;

        case 4:
            /* Besides the continuation-byte checks, the second byte is
               range-checked for lead bytes 0xF0 (must be >= 0x90) and 0xF4
               (must be <= 0x8F); the assert below states the resulting
               code point range. */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                /* Extend the error range over each leading continuation
                   byte that is well-formed. */
                if ((s[1] & 0xC0) == 0x80) {
                    endinpos++;
                    if ((s[2] & 0xC0) == 0x80)
                        endinpos++;
                }
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

            WRITE_MAYBE_FAIL(i++, ch);
            break;
        }
        s += n;
        continue;

      utf8Error:
        /* has_errors is already non-zero whenever this loop runs (the
           !has_errors case returned before the loop), so this refit branch
           is not taken on the paths visible in this function. */
        if (!has_errors) {
            if (refit_partial_string(&unicode, kind, data, i) < 0)
                goto onError;
            has_errors = 1;
        }
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf8", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &i))
            goto onError;
        /* Update data because unicode_decode_call_errorhandler might have
           re-created or resized the unicode object. */
        data = PyUnicode_DATA(unicode);
        kind = PyUnicode_KIND(unicode);
        /* e is passed by address above, so recompute the aligned bound. */
        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
    }
    /* Ensure the unicode_size calculation above was correct: */
    assert(has_errors || i == unicode_size);

    if (consumed)
        *consumed = s-starts;

    /* Adjust length and ready string when it contained errors and
       is of the old resizable kind. */
    if (has_errors) {
        if (PyUnicode_Resize(&unicode, i) < 0)
            goto onError;
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}

#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004636
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004637#ifdef __APPLE__
4638
4639/* Simplified UTF-8 decoder using surrogateescape error handler,
4640 used to decode the command line arguments on Mac OS X. */
4641
4642wchar_t*
4643_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4644{
4645 int n;
4646 const char *e;
4647 wchar_t *unicode, *p;
4648
4649 /* Note: size will always be longer than the resulting Unicode
4650 character count */
4651 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4652 PyErr_NoMemory();
4653 return NULL;
4654 }
4655 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4656 if (!unicode)
4657 return NULL;
4658
4659 /* Unpack UTF-8 encoded data */
4660 p = unicode;
4661 e = s + size;
4662 while (s < e) {
4663 Py_UCS4 ch = (unsigned char)*s;
4664
4665 if (ch < 0x80) {
4666 *p++ = (wchar_t)ch;
4667 s++;
4668 continue;
4669 }
4670
4671 n = utf8_code_length[ch];
4672 if (s + n > e) {
4673 goto surrogateescape;
4674 }
4675
4676 switch (n) {
4677 case 0:
4678 case 1:
4679 goto surrogateescape;
4680
4681 case 2:
4682 if ((s[1] & 0xc0) != 0x80)
4683 goto surrogateescape;
4684 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4685 assert ((ch > 0x007F) && (ch <= 0x07FF));
4686 *p++ = (wchar_t)ch;
4687 break;
4688
4689 case 3:
4690 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4691 will result in surrogates in range d800-dfff. Surrogates are
4692 not valid UTF-8 so they are rejected.
4693 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4694 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4695 if ((s[1] & 0xc0) != 0x80 ||
4696 (s[2] & 0xc0) != 0x80 ||
4697 ((unsigned char)s[0] == 0xE0 &&
4698 (unsigned char)s[1] < 0xA0) ||
4699 ((unsigned char)s[0] == 0xED &&
4700 (unsigned char)s[1] > 0x9F)) {
4701
4702 goto surrogateescape;
4703 }
4704 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4705 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004706 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004707 break;
4708
4709 case 4:
4710 if ((s[1] & 0xc0) != 0x80 ||
4711 (s[2] & 0xc0) != 0x80 ||
4712 (s[3] & 0xc0) != 0x80 ||
4713 ((unsigned char)s[0] == 0xF0 &&
4714 (unsigned char)s[1] < 0x90) ||
4715 ((unsigned char)s[0] == 0xF4 &&
4716 (unsigned char)s[1] > 0x8F)) {
4717 goto surrogateescape;
4718 }
4719 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4720 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4721 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4722
4723#if SIZEOF_WCHAR_T == 4
4724 *p++ = (wchar_t)ch;
4725#else
4726 /* compute and append the two surrogates: */
4727
4728 /* translate from 10000..10FFFF to 0..FFFF */
4729 ch -= 0x10000;
4730
4731 /* high surrogate = top 10 bits added to D800 */
4732 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4733
4734 /* low surrogate = bottom 10 bits added to DC00 */
4735 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4736#endif
4737 break;
4738 }
4739 s += n;
4740 continue;
4741
4742 surrogateescape:
4743 *p++ = 0xDC00 + ch;
4744 s++;
4745 }
4746 *p = L'\0';
4747 return unicode;
4748}
4749
4750#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004752/* Primary internal function which creates utf8 encoded bytes objects.
4753
4754 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004755 and allocate exactly as much space needed at the end. Else allocate the
4756 maximum possible needed (4 result bytes per Unicode character), and return
4757 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004758*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004759PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004760_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761{
Tim Peters602f7402002-04-27 18:03:26 +00004762#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004763
Guido van Rossum98297ee2007-11-06 21:34:58 +00004764 Py_ssize_t i; /* index into s of next input byte */
4765 PyObject *result; /* result string object */
4766 char *p; /* next free byte in output buffer */
4767 Py_ssize_t nallocated; /* number of result bytes allocated */
4768 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004769 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004770 PyObject *errorHandler = NULL;
4771 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004772 int kind;
4773 void *data;
4774 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004775 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004777 if (!PyUnicode_Check(unicode)) {
4778 PyErr_BadArgument();
4779 return NULL;
4780 }
4781
4782 if (PyUnicode_READY(unicode) == -1)
4783 return NULL;
4784
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004785 if (PyUnicode_UTF8(unicode))
4786 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4787 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004788
4789 kind = PyUnicode_KIND(unicode);
4790 data = PyUnicode_DATA(unicode);
4791 size = PyUnicode_GET_LENGTH(unicode);
4792
Tim Peters602f7402002-04-27 18:03:26 +00004793 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794
Tim Peters602f7402002-04-27 18:03:26 +00004795 if (size <= MAX_SHORT_UNICHARS) {
4796 /* Write into the stack buffer; nallocated can't overflow.
4797 * At the end, we'll allocate exactly as much heap space as it
4798 * turns out we need.
4799 */
4800 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004801 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004802 p = stackbuf;
4803 }
4804 else {
4805 /* Overallocate on the heap, and give the excess back at the end. */
4806 nallocated = size * 4;
4807 if (nallocated / 4 != size) /* overflow! */
4808 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004809 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004810 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004811 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004812 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004813 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004814
Tim Peters602f7402002-04-27 18:03:26 +00004815 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004816 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004817
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004818 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004819 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004821
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004823 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004824 *p++ = (char)(0xc0 | (ch >> 6));
4825 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004826 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004827 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004828 Py_ssize_t repsize, k, startpos;
4829 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004830 rep = unicode_encode_call_errorhandler(
4831 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004832 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004833 if (!rep)
4834 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004836 if (PyBytes_Check(rep))
4837 repsize = PyBytes_GET_SIZE(rep);
4838 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004839 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004840
4841 if (repsize > 4) {
4842 Py_ssize_t offset;
4843
4844 if (result == NULL)
4845 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004846 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004847 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004849 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4850 /* integer overflow */
4851 PyErr_NoMemory();
4852 goto error;
4853 }
4854 nallocated += repsize - 4;
4855 if (result != NULL) {
4856 if (_PyBytes_Resize(&result, nallocated) < 0)
4857 goto error;
4858 } else {
4859 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004860 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004861 goto error;
4862 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4863 }
4864 p = PyBytes_AS_STRING(result) + offset;
4865 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004867 if (PyBytes_Check(rep)) {
4868 char *prep = PyBytes_AS_STRING(rep);
4869 for(k = repsize; k > 0; k--)
4870 *p++ = *prep++;
4871 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004872 enum PyUnicode_Kind repkind;
4873 void *repdata;
4874
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004875 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004876 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004877 repkind = PyUnicode_KIND(rep);
4878 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004879
4880 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004881 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004882 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004883 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004884 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004885 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004886 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004887 goto error;
4888 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004889 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004890 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004891 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004892 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004893 } else if (ch < 0x10000) {
4894 *p++ = (char)(0xe0 | (ch >> 12));
4895 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4896 *p++ = (char)(0x80 | (ch & 0x3f));
4897 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004898 /* Encode UCS4 Unicode ordinals */
4899 *p++ = (char)(0xf0 | (ch >> 18));
4900 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4901 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4902 *p++ = (char)(0x80 | (ch & 0x3f));
4903 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004905
Guido van Rossum98297ee2007-11-06 21:34:58 +00004906 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004907 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004908 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004909 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004910 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004911 }
4912 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004913 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004914 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004915 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004916 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004917 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004918
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004919 Py_XDECREF(errorHandler);
4920 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004921 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004922 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004923 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004924 Py_XDECREF(errorHandler);
4925 Py_XDECREF(exc);
4926 Py_XDECREF(result);
4927 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004928
Tim Peters602f7402002-04-27 18:03:26 +00004929#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930}
4931
Alexander Belopolsky40018472011-02-26 01:02:56 +00004932PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004933PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4934 Py_ssize_t size,
4935 const char *errors)
4936{
4937 PyObject *v, *unicode;
4938
4939 unicode = PyUnicode_FromUnicode(s, size);
4940 if (unicode == NULL)
4941 return NULL;
4942 v = _PyUnicode_AsUTF8String(unicode, errors);
4943 Py_DECREF(unicode);
4944 return v;
4945}
4946
/* Encode `unicode` as UTF-8 by calling _PyUnicode_AsUTF8String() with
   errors == NULL; returns whatever that call returns. */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    return _PyUnicode_AsUTF8String(unicode, NULL);
}
4952
Walter Dörwald41980ca2007-08-16 21:55:45 +00004953/* --- UTF-32 Codec ------------------------------------------------------- */
4954
/* Decode a UTF-32 buffer in one go: forwards all arguments to
   PyUnicode_DecodeUTF32Stateful() with consumed == NULL and returns its
   result. */
PyObject *
PyUnicode_DecodeUTF32(const char *s,
                      Py_ssize_t size,
                      const char *errors,
                      int *byteorder)
{
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
}
4963
4964PyObject *
4965PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004966 Py_ssize_t size,
4967 const char *errors,
4968 int *byteorder,
4969 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004970{
4971 const char *starts = s;
4972 Py_ssize_t startinpos;
4973 Py_ssize_t endinpos;
4974 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004975 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004976 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004977 int bo = 0; /* assume native ordering by default */
4978 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004979 /* Offsets from q for retrieving bytes in the right order. */
4980#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4981 int iorder[] = {0, 1, 2, 3};
4982#else
4983 int iorder[] = {3, 2, 1, 0};
4984#endif
4985 PyObject *errorHandler = NULL;
4986 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004987
Walter Dörwald41980ca2007-08-16 21:55:45 +00004988 q = (unsigned char *)s;
4989 e = q + size;
4990
4991 if (byteorder)
4992 bo = *byteorder;
4993
4994 /* Check for BOM marks (U+FEFF) in the input and adjust current
4995 byte order setting accordingly. In native mode, the leading BOM
4996 mark is skipped, in all other modes, it is copied to the output
4997 stream as-is (giving a ZWNBSP character). */
4998 if (bo == 0) {
4999 if (size >= 4) {
5000 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005001 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005002#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005003 if (bom == 0x0000FEFF) {
5004 q += 4;
5005 bo = -1;
5006 }
5007 else if (bom == 0xFFFE0000) {
5008 q += 4;
5009 bo = 1;
5010 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005011#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005012 if (bom == 0x0000FEFF) {
5013 q += 4;
5014 bo = 1;
5015 }
5016 else if (bom == 0xFFFE0000) {
5017 q += 4;
5018 bo = -1;
5019 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005020#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005021 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005022 }
5023
5024 if (bo == -1) {
5025 /* force LE */
5026 iorder[0] = 0;
5027 iorder[1] = 1;
5028 iorder[2] = 2;
5029 iorder[3] = 3;
5030 }
5031 else if (bo == 1) {
5032 /* force BE */
5033 iorder[0] = 3;
5034 iorder[1] = 2;
5035 iorder[2] = 1;
5036 iorder[3] = 0;
5037 }
5038
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005039 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005040 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005041 if (!unicode)
5042 return NULL;
5043 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005044 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005045 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005046
Walter Dörwald41980ca2007-08-16 21:55:45 +00005047 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 Py_UCS4 ch;
5049 /* remaining bytes at the end? (size should be divisible by 4) */
5050 if (e-q<4) {
5051 if (consumed)
5052 break;
5053 errmsg = "truncated data";
5054 startinpos = ((const char *)q)-starts;
5055 endinpos = ((const char *)e)-starts;
5056 goto utf32Error;
5057 /* The remaining input chars are ignored if the callback
5058 chooses to skip the input */
5059 }
5060 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5061 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005062
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 if (ch >= 0x110000)
5064 {
5065 errmsg = "codepoint not in range(0x110000)";
5066 startinpos = ((const char *)q)-starts;
5067 endinpos = startinpos+4;
5068 goto utf32Error;
5069 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005070 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5071 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 q += 4;
5073 continue;
5074 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005075 if (unicode_decode_call_errorhandler(
5076 errors, &errorHandler,
5077 "utf32", errmsg,
5078 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005079 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005080 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005081 }
5082
5083 if (byteorder)
5084 *byteorder = bo;
5085
5086 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005087 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005088
5089 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005090 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005091 goto onError;
5092
5093 Py_XDECREF(errorHandler);
5094 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005095 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005096
Benjamin Peterson29060642009-01-31 22:14:21 +00005097 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098 Py_DECREF(unicode);
5099 Py_XDECREF(errorHandler);
5100 Py_XDECREF(exc);
5101 return NULL;
5102}
5103
5104PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005105_PyUnicode_EncodeUTF32(PyObject *str,
5106 const char *errors,
5107 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005108{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005109 int kind;
5110 void *data;
5111 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005112 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005113 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005114 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005115 /* Offsets from p for storing byte pairs in the right order. */
5116#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5117 int iorder[] = {0, 1, 2, 3};
5118#else
5119 int iorder[] = {3, 2, 1, 0};
5120#endif
5121
Benjamin Peterson29060642009-01-31 22:14:21 +00005122#define STORECHAR(CH) \
5123 do { \
5124 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5125 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5126 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5127 p[iorder[0]] = (CH) & 0xff; \
5128 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005129 } while(0)
5130
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005131 if (!PyUnicode_Check(str)) {
5132 PyErr_BadArgument();
5133 return NULL;
5134 }
5135 if (PyUnicode_READY(str) < 0)
5136 return NULL;
5137 kind = PyUnicode_KIND(str);
5138 data = PyUnicode_DATA(str);
5139 len = PyUnicode_GET_LENGTH(str);
5140
5141 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005142 bytesize = nsize * 4;
5143 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005144 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005145 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005146 if (v == NULL)
5147 return NULL;
5148
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005149 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005150 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005151 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005152 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005153 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005154
5155 if (byteorder == -1) {
5156 /* force LE */
5157 iorder[0] = 0;
5158 iorder[1] = 1;
5159 iorder[2] = 2;
5160 iorder[3] = 3;
5161 }
5162 else if (byteorder == 1) {
5163 /* force BE */
5164 iorder[0] = 3;
5165 iorder[1] = 2;
5166 iorder[2] = 1;
5167 iorder[3] = 0;
5168 }
5169
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005170 for (i = 0; i < len; i++)
5171 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005172
5173 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005174 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005175#undef STORECHAR
5176}
5177
Alexander Belopolsky40018472011-02-26 01:02:56 +00005178PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005179PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5180 Py_ssize_t size,
5181 const char *errors,
5182 int byteorder)
5183{
5184 PyObject *result;
5185 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5186 if (tmp == NULL)
5187 return NULL;
5188 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5189 Py_DECREF(tmp);
5190 return result;
5191}
5192
5193PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005194PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005195{
Victor Stinnerb960b342011-11-20 19:12:52 +01005196 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005197}
5198
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199/* --- UTF-16 Codec ------------------------------------------------------- */
5200
Tim Peters772747b2001-08-09 22:21:55 +00005201PyObject *
5202PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005203 Py_ssize_t size,
5204 const char *errors,
5205 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206{
Walter Dörwald69652032004-09-07 20:24:22 +00005207 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5208}
5209
Antoine Pitrouab868312009-01-10 15:40:25 +00005210/* Two masks for fast checking of whether a C 'long' may contain
5211 UTF16-encoded surrogate characters. This is an efficient heuristic,
5212 assuming that non-surrogate characters with a code point >= 0x8000 are
5213 rare in most input.
5214 FAST_CHAR_MASK is used when the input is in native byte ordering,
5215 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005216*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005217#if (SIZEOF_LONG == 8)
5218# define FAST_CHAR_MASK 0x8000800080008000L
5219# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5220#elif (SIZEOF_LONG == 4)
5221# define FAST_CHAR_MASK 0x80008000L
5222# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5223#else
5224# error C 'long' size should be either 4 or 8!
5225#endif
5226
Walter Dörwald69652032004-09-07 20:24:22 +00005227PyObject *
5228PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005229 Py_ssize_t size,
5230 const char *errors,
5231 int *byteorder,
5232 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005233{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005234 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005235 Py_ssize_t startinpos;
5236 Py_ssize_t endinpos;
5237 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005238 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005239 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005240 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005241 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005242 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005243 /* Offsets from q for retrieving byte pairs in the right order. */
5244#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5245 int ihi = 1, ilo = 0;
5246#else
5247 int ihi = 0, ilo = 1;
5248#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005249 PyObject *errorHandler = NULL;
5250 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251
5252 /* Note: size will always be longer than the resulting Unicode
5253 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005254 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255 if (!unicode)
5256 return NULL;
5257 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005258 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005259 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260
Tim Peters772747b2001-08-09 22:21:55 +00005261 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005262 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263
5264 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005265 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005267 /* Check for BOM marks (U+FEFF) in the input and adjust current
5268 byte order setting accordingly. In native mode, the leading BOM
5269 mark is skipped, in all other modes, it is copied to the output
5270 stream as-is (giving a ZWNBSP character). */
5271 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005272 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005273 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005274#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005275 if (bom == 0xFEFF) {
5276 q += 2;
5277 bo = -1;
5278 }
5279 else if (bom == 0xFFFE) {
5280 q += 2;
5281 bo = 1;
5282 }
Tim Petersced69f82003-09-16 20:30:58 +00005283#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005284 if (bom == 0xFEFF) {
5285 q += 2;
5286 bo = 1;
5287 }
5288 else if (bom == 0xFFFE) {
5289 q += 2;
5290 bo = -1;
5291 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005292#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005293 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005294 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295
Tim Peters772747b2001-08-09 22:21:55 +00005296 if (bo == -1) {
5297 /* force LE */
5298 ihi = 1;
5299 ilo = 0;
5300 }
5301 else if (bo == 1) {
5302 /* force BE */
5303 ihi = 0;
5304 ilo = 1;
5305 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005306#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5307 native_ordering = ilo < ihi;
5308#else
5309 native_ordering = ilo > ihi;
5310#endif
Tim Peters772747b2001-08-09 22:21:55 +00005311
Antoine Pitrouab868312009-01-10 15:40:25 +00005312 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005313 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005314 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005315 /* First check for possible aligned read of a C 'long'. Unaligned
5316 reads are more expensive, better to defer to another iteration. */
5317 if (!((size_t) q & LONG_PTR_MASK)) {
5318 /* Fast path for runs of non-surrogate chars. */
5319 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005320 int kind = PyUnicode_KIND(unicode);
5321 void *data = PyUnicode_DATA(unicode);
5322 while (_q < aligned_end) {
5323 unsigned long block = * (unsigned long *) _q;
5324 unsigned short *pblock = (unsigned short*)&block;
5325 Py_UCS4 maxch;
5326 if (native_ordering) {
5327 /* Can use buffer directly */
5328 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005329 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005330 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005331 else {
5332 /* Need to byte-swap */
5333 unsigned char *_p = (unsigned char*)pblock;
5334 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005335 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005336 _p[0] = _q[1];
5337 _p[1] = _q[0];
5338 _p[2] = _q[3];
5339 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005340#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005341 _p[4] = _q[5];
5342 _p[5] = _q[4];
5343 _p[6] = _q[7];
5344 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005345#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005346 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005347 maxch = Py_MAX(pblock[0], pblock[1]);
5348#if SIZEOF_LONG == 8
5349 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5350#endif
5351 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5352 if (unicode_widen(&unicode, maxch) < 0)
5353 goto onError;
5354 kind = PyUnicode_KIND(unicode);
5355 data = PyUnicode_DATA(unicode);
5356 }
5357 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5358 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5359#if SIZEOF_LONG == 8
5360 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5361 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5362#endif
5363 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005364 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005365 q = _q;
5366 if (q >= e)
5367 break;
5368 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005370
Benjamin Peterson14339b62009-01-31 16:36:08 +00005371 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005372
5373 if (ch < 0xD800 || ch > 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005374 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5375 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 continue;
5377 }
5378
5379 /* UTF-16 code pair: */
5380 if (q > e) {
5381 errmsg = "unexpected end of data";
5382 startinpos = (((const char *)q) - 2) - starts;
5383 endinpos = ((const char *)e) + 1 - starts;
5384 goto utf16Error;
5385 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005386 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5387 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005389 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005390 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005391 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005392 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005393 continue;
5394 }
5395 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005396 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005397 startinpos = (((const char *)q)-4)-starts;
5398 endinpos = startinpos+2;
5399 goto utf16Error;
5400 }
5401
Benjamin Peterson14339b62009-01-31 16:36:08 +00005402 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005403 errmsg = "illegal encoding";
5404 startinpos = (((const char *)q)-2)-starts;
5405 endinpos = startinpos+2;
5406 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005407
Benjamin Peterson29060642009-01-31 22:14:21 +00005408 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005409 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005410 errors,
5411 &errorHandler,
5412 "utf16", errmsg,
5413 &starts,
5414 (const char **)&e,
5415 &startinpos,
5416 &endinpos,
5417 &exc,
5418 (const char **)&q,
5419 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005420 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005421 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005423 /* remaining byte at the end? (size should be even) */
5424 if (e == q) {
5425 if (!consumed) {
5426 errmsg = "truncated data";
5427 startinpos = ((const char *)q) - starts;
5428 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005429 if (unicode_decode_call_errorhandler(
5430 errors,
5431 &errorHandler,
5432 "utf16", errmsg,
5433 &starts,
5434 (const char **)&e,
5435 &startinpos,
5436 &endinpos,
5437 &exc,
5438 (const char **)&q,
5439 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005440 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005441 goto onError;
5442 /* The remaining input chars are ignored if the callback
5443 chooses to skip the input */
5444 }
5445 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446
5447 if (byteorder)
5448 *byteorder = bo;
5449
Walter Dörwald69652032004-09-07 20:24:22 +00005450 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005451 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005452
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005454 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 goto onError;
5456
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005457 Py_XDECREF(errorHandler);
5458 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005459 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460
Benjamin Peterson29060642009-01-31 22:14:21 +00005461 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005463 Py_XDECREF(errorHandler);
5464 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465 return NULL;
5466}
5467
Antoine Pitrouab868312009-01-10 15:40:25 +00005468#undef FAST_CHAR_MASK
5469#undef SWAPPED_FAST_CHAR_MASK
5470
Tim Peters772747b2001-08-09 22:21:55 +00005471PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005472_PyUnicode_EncodeUTF16(PyObject *str,
5473 const char *errors,
5474 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005476 int kind;
5477 void *data;
5478 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005479 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005480 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005481 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005482 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005483 /* Offsets from p for storing byte pairs in the right order. */
5484#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5485 int ihi = 1, ilo = 0;
5486#else
5487 int ihi = 0, ilo = 1;
5488#endif
5489
Benjamin Peterson29060642009-01-31 22:14:21 +00005490#define STORECHAR(CH) \
5491 do { \
5492 p[ihi] = ((CH) >> 8) & 0xff; \
5493 p[ilo] = (CH) & 0xff; \
5494 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005495 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005496
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005497 if (!PyUnicode_Check(str)) {
5498 PyErr_BadArgument();
5499 return NULL;
5500 }
5501 if (PyUnicode_READY(str) < 0)
5502 return NULL;
5503 kind = PyUnicode_KIND(str);
5504 data = PyUnicode_DATA(str);
5505 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005506
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005507 pairs = 0;
5508 if (kind == PyUnicode_4BYTE_KIND)
5509 for (i = 0; i < len; i++)
5510 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5511 pairs++;
5512 /* 2 * (len + pairs + (byteorder == 0)) */
5513 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005514 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005515 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005516 bytesize = nsize * 2;
5517 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005518 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005519 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 if (v == NULL)
5521 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005523 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005526 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005527 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005528
5529 if (byteorder == -1) {
5530 /* force LE */
5531 ihi = 1;
5532 ilo = 0;
5533 }
5534 else if (byteorder == 1) {
5535 /* force BE */
5536 ihi = 0;
5537 ilo = 1;
5538 }
5539
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005540 for (i = 0; i < len; i++) {
5541 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5542 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005543 if (ch >= 0x10000) {
5544 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5545 ch = 0xD800 | ((ch-0x10000) >> 10);
5546 }
Tim Peters772747b2001-08-09 22:21:55 +00005547 STORECHAR(ch);
5548 if (ch2)
5549 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005550 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005551
5552 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005553 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005554#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555}
5556
Alexander Belopolsky40018472011-02-26 01:02:56 +00005557PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005558PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5559 Py_ssize_t size,
5560 const char *errors,
5561 int byteorder)
5562{
5563 PyObject *result;
5564 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5565 if (tmp == NULL)
5566 return NULL;
5567 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5568 Py_DECREF(tmp);
5569 return result;
5570}
5571
5572PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005573PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005575 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576}
5577
5578/* --- Unicode Escape Codec ----------------------------------------------- */
5579
/* Helper function for PyUnicode_DecodeUnicodeEscape: scans the escaped
   byte string s of length size and determines whether decoding it can
   only produce ASCII characters.

   Returns the number of characters the decoded string will hold when the
   input consists solely of ASCII bytes, simple one-character escapes,
   backslash-newline continuations and unrecognized escapes (kept as two
   characters).  Returns -1 when size is negative, when a non-ASCII byte
   is present, when a backslash is the last byte, or when an octal, \x,
   \u, \U or \N escape occurs.
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before forming s + size: pointer arithmetic with a
       negative length would step outside the buffer. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5634
/* File-scope pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in on first use by the escape decoder
   when it has to resolve a \N{...} character name -- confirm against the
   code that assigns it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005635static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005636
Alexander Belopolsky40018472011-02-26 01:02:56 +00005637PyObject *
5638PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005639 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005640 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005642 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005643 Py_ssize_t startinpos;
5644 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005645 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005646 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005648 char* message;
5649 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005650 PyObject *errorHandler = NULL;
5651 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005652 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005653 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005654
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005655 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005656
5657 /* After length_of_escaped_ascii_string() there are two alternatives,
5658 either the string is pure ASCII with named escapes like \n, etc.
5659 and we determined it's exact size (common case)
5660 or it contains \x, \u, ... escape sequences. then we create a
5661 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005662 if (len >= 0) {
5663 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005664 if (!v)
5665 goto onError;
5666 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005667 }
5668 else {
5669 /* Escaped strings will always be longer than the resulting
5670 Unicode string, so we start with size here and then reduce the
5671 length after conversion to the true value.
5672 (but if the error callback returns a long replacement string
5673 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005674 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005675 if (!v)
5676 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005677 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005678 }
5679
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005681 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005682 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005684
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 while (s < end) {
5686 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005687 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005688 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005690 /* The only case in which i == ascii_length is a backslash
5691 followed by a newline. */
5692 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005693
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 /* Non-escape characters are interpreted as Unicode ordinals */
5695 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005696 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5697 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 continue;
5699 }
5700
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005701 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 /* \ - Escapes */
5703 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005704 c = *s++;
5705 if (s > end)
5706 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005707
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005708 /* The only case in which i == ascii_length is a backslash
5709 followed by a newline. */
5710 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005711
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005712 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005715#define WRITECHAR(ch) \
5716 do { \
5717 if (unicode_putchar(&v, &i, ch) < 0) \
5718 goto onError; \
5719 }while(0)
5720
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005722 case '\\': WRITECHAR('\\'); break;
5723 case '\'': WRITECHAR('\''); break;
5724 case '\"': WRITECHAR('\"'); break;
5725 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005726 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005727 case 'f': WRITECHAR('\014'); break;
5728 case 't': WRITECHAR('\t'); break;
5729 case 'n': WRITECHAR('\n'); break;
5730 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005731 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005732 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005733 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005734 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 case '0': case '1': case '2': case '3':
5738 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005739 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005740 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005741 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005742 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005743 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005745 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 break;
5747
Benjamin Peterson29060642009-01-31 22:14:21 +00005748 /* hex escapes */
5749 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005751 digits = 2;
5752 message = "truncated \\xXX escape";
5753 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005757 digits = 4;
5758 message = "truncated \\uXXXX escape";
5759 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760
Benjamin Peterson29060642009-01-31 22:14:21 +00005761 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005762 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005763 digits = 8;
5764 message = "truncated \\UXXXXXXXX escape";
5765 hexescape:
5766 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767 if (s+digits>end) {
5768 endinpos = size;
5769 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005770 errors, &errorHandler,
5771 "unicodeescape", "end of string in escape sequence",
5772 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005773 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 goto onError;
5775 goto nextByte;
5776 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005777 for (j = 0; j < digits; ++j) {
5778 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005779 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005780 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 errors, &errorHandler,
5783 "unicodeescape", message,
5784 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005785 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005786 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005787 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005788 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005789 }
5790 chr = (chr<<4) & ~0xF;
5791 if (c >= '0' && c <= '9')
5792 chr += c - '0';
5793 else if (c >= 'a' && c <= 'f')
5794 chr += 10 + c - 'a';
5795 else
5796 chr += 10 + c - 'A';
5797 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005798 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005799 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005800 /* _decoding_error will have already written into the
5801 target buffer. */
5802 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005803 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005804 /* when we get here, chr is a 32-bit unicode character */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005805 if (chr <= 0x10ffff) {
5806 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005807 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005808 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005809 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005810 errors, &errorHandler,
5811 "unicodeescape", "illegal Unicode character",
5812 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005813 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005814 goto onError;
5815 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005816 break;
5817
Benjamin Peterson29060642009-01-31 22:14:21 +00005818 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005819 case 'N':
5820 message = "malformed \\N character escape";
5821 if (ucnhash_CAPI == NULL) {
5822 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005823 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5824 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005825 if (ucnhash_CAPI == NULL)
5826 goto ucnhashError;
5827 }
5828 if (*s == '{') {
5829 const char *start = s+1;
5830 /* look for the closing brace */
5831 while (*s != '}' && s < end)
5832 s++;
5833 if (s > start && s < end && *s == '}') {
5834 /* found a name. look it up in the unicode database */
5835 message = "unknown Unicode character name";
5836 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005837 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005838 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005839 goto store;
5840 }
5841 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005842 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005843 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005844 errors, &errorHandler,
5845 "unicodeescape", message,
5846 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005847 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005848 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005849 break;
5850
5851 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005852 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005853 message = "\\ at end of string";
5854 s--;
5855 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005856 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005857 errors, &errorHandler,
5858 "unicodeescape", message,
5859 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005860 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005861 goto onError;
5862 }
5863 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005864 WRITECHAR('\\');
5865 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005866 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005867 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005869 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005870 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005871 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005872#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005873
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005874 if (PyUnicode_Resize(&v, i) < 0)
5875 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005876 Py_XDECREF(errorHandler);
5877 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005878 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005879
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005881 PyErr_SetString(
5882 PyExc_UnicodeError,
5883 "\\N escapes not supported (can't load unicodedata module)"
5884 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005885 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005886 Py_XDECREF(errorHandler);
5887 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005888 return NULL;
5889
Benjamin Peterson29060642009-01-31 22:14:21 +00005890 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005892 Py_XDECREF(errorHandler);
5893 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 return NULL;
5895}
5896
5897/* Return a Unicode-Escape string version of the Unicode object.
5898
5899 If quotes is true, the string is enclosed in u"" or u'' quotes as
5900 appropriate.
5901
5902*/
5903
Alexander Belopolsky40018472011-02-26 01:02:56 +00005904PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005905PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005907 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005908 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005909 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005910 int kind;
5911 void *data;
5912 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913
Thomas Wouters89f507f2006-12-13 04:49:30 +00005914 /* Initial allocation is based on the longest-possible unichr
5915 escape.
5916
5917 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5918 unichr, so in this case it's the longest unichr escape. In
5919 narrow (UTF-16) builds this is five chars per source unichr
5920 since there are two unichrs in the surrogate pair, so in narrow
5921 (UTF-16) builds it's not the longest unichr escape.
5922
5923 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5924 so in the narrow (UTF-16) build case it's the longest unichr
5925 escape.
5926 */
5927
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005928 if (!PyUnicode_Check(unicode)) {
5929 PyErr_BadArgument();
5930 return NULL;
5931 }
5932 if (PyUnicode_READY(unicode) < 0)
5933 return NULL;
5934 len = PyUnicode_GET_LENGTH(unicode);
5935 kind = PyUnicode_KIND(unicode);
5936 data = PyUnicode_DATA(unicode);
5937 switch(kind) {
5938 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5939 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5940 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5941 }
5942
5943 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005944 return PyBytes_FromStringAndSize(NULL, 0);
5945
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005946 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005947 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005948
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005949 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005951 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005952 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 if (repr == NULL)
5954 return NULL;
5955
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005956 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005958 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005959 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005960
Walter Dörwald79e913e2007-05-12 11:08:06 +00005961 /* Escape backslashes */
5962 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 *p++ = '\\';
5964 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005965 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005966 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005967
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005968 /* Map 21-bit characters to '\U00xxxxxx' */
5969 else if (ch >= 0x10000) {
5970 *p++ = '\\';
5971 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005972 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5973 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5974 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5975 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5976 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5977 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5978 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5979 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005981 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005982
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005984 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 *p++ = '\\';
5986 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005987 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5988 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5989 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5990 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005992
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005993 /* Map special whitespace to '\t', \n', '\r' */
5994 else if (ch == '\t') {
5995 *p++ = '\\';
5996 *p++ = 't';
5997 }
5998 else if (ch == '\n') {
5999 *p++ = '\\';
6000 *p++ = 'n';
6001 }
6002 else if (ch == '\r') {
6003 *p++ = '\\';
6004 *p++ = 'r';
6005 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006006
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006007 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006008 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006009 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006010 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006011 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6012 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006013 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006014
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 /* Copy everything else as-is */
6016 else
6017 *p++ = (char) ch;
6018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006020 assert(p - PyBytes_AS_STRING(repr) > 0);
6021 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6022 return NULL;
6023 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024}
6025
Alexander Belopolsky40018472011-02-26 01:02:56 +00006026PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006027PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6028 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006030 PyObject *result;
6031 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6032 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006034 result = PyUnicode_AsUnicodeEscapeString(tmp);
6035 Py_DECREF(tmp);
6036 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037}
6038
6039/* --- Raw Unicode Escape Codec ------------------------------------------- */
6040
Alexander Belopolsky40018472011-02-26 01:02:56 +00006041PyObject *
6042PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006043 Py_ssize_t size,
6044 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006046 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006047 Py_ssize_t startinpos;
6048 Py_ssize_t endinpos;
6049 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006050 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 const char *end;
6052 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006053 PyObject *errorHandler = NULL;
6054 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006055
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 /* Escaped strings will always be longer than the resulting
6057 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006058 length after conversion to the true value. (But decoding error
6059 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006060 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006062 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006064 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006065 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066 end = s + size;
6067 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 unsigned char c;
6069 Py_UCS4 x;
6070 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006071 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006072
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 /* Non-escape characters are interpreted as Unicode ordinals */
6074 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006075 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6076 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006078 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006079 startinpos = s-starts;
6080
6081 /* \u-escapes are only interpreted iff the number of leading
6082 backslashes if odd */
6083 bs = s;
6084 for (;s < end;) {
6085 if (*s != '\\')
6086 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006087 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6088 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006089 }
6090 if (((s - bs) & 1) == 0 ||
6091 s >= end ||
6092 (*s != 'u' && *s != 'U')) {
6093 continue;
6094 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006095 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 count = *s=='u' ? 4 : 8;
6097 s++;
6098
6099 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 for (x = 0, i = 0; i < count; ++i, ++s) {
6101 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006102 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 endinpos = s-starts;
6104 if (unicode_decode_call_errorhandler(
6105 errors, &errorHandler,
6106 "rawunicodeescape", "truncated \\uXXXX",
6107 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006108 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006109 goto onError;
6110 goto nextByte;
6111 }
6112 x = (x<<4) & ~0xF;
6113 if (c >= '0' && c <= '9')
6114 x += c - '0';
6115 else if (c >= 'a' && c <= 'f')
6116 x += 10 + c - 'a';
6117 else
6118 x += 10 + c - 'A';
6119 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006120 if (x <= 0x10ffff) {
6121 if (unicode_putchar(&v, &outpos, x) < 0)
6122 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006123 } else {
6124 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006125 if (unicode_decode_call_errorhandler(
6126 errors, &errorHandler,
6127 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006129 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006131 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 nextByte:
6133 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006135 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006137 Py_XDECREF(errorHandler);
6138 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006139 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006140
Benjamin Peterson29060642009-01-31 22:14:21 +00006141 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006143 Py_XDECREF(errorHandler);
6144 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 return NULL;
6146}
6147
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006148
Alexander Belopolsky40018472011-02-26 01:02:56 +00006149PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006150PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006152 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 char *p;
6154 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006155 Py_ssize_t expandsize, pos;
6156 int kind;
6157 void *data;
6158 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006160 if (!PyUnicode_Check(unicode)) {
6161 PyErr_BadArgument();
6162 return NULL;
6163 }
6164 if (PyUnicode_READY(unicode) < 0)
6165 return NULL;
6166 kind = PyUnicode_KIND(unicode);
6167 data = PyUnicode_DATA(unicode);
6168 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006169
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006170 switch(kind) {
6171 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6172 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6173 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6174 }
Victor Stinner0e368262011-11-10 20:12:49 +01006175
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006176 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006178
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006179 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 if (repr == NULL)
6181 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006182 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006183 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006185 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006186 for (pos = 0; pos < len; pos++) {
6187 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006188 /* Map 32-bit characters to '\Uxxxxxxxx' */
6189 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006190 *p++ = '\\';
6191 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006192 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6193 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6194 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6195 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6196 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6197 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6198 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6199 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006200 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006202 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203 *p++ = '\\';
6204 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006205 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6206 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6207 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6208 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 /* Copy everything else as-is */
6211 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212 *p++ = (char) ch;
6213 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006214
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006215 assert(p > q);
6216 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006217 return NULL;
6218 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219}
6220
Alexander Belopolsky40018472011-02-26 01:02:56 +00006221PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006222PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6223 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006225 PyObject *result;
6226 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6227 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006228 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006229 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6230 Py_DECREF(tmp);
6231 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232}
6233
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006234/* --- Unicode Internal Codec ------------------------------------------- */
6235
Alexander Belopolsky40018472011-02-26 01:02:56 +00006236PyObject *
6237_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006238 Py_ssize_t size,
6239 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006240{
6241 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006242 Py_ssize_t startinpos;
6243 Py_ssize_t endinpos;
6244 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006245 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006246 const char *end;
6247 const char *reason;
6248 PyObject *errorHandler = NULL;
6249 PyObject *exc = NULL;
6250
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006251 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006252 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006253 1))
6254 return NULL;
6255
Thomas Wouters89f507f2006-12-13 04:49:30 +00006256 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006257 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006258 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006260 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006261 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006262 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006263 end = s + size;
6264
6265 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006266 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006267 Py_UCS4 ch;
6268 /* We copy the raw representation one byte at a time because the
6269 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006270 ((char *) &uch)[0] = s[0];
6271 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006272#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006273 ((char *) &uch)[2] = s[2];
6274 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006275#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006276 ch = uch;
6277
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006278 /* We have to sanity check the raw data, otherwise doom looms for
6279 some malformed UCS-4 data. */
6280 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006281#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006282 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006283#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006284 end-s < Py_UNICODE_SIZE
6285 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006287 startinpos = s - starts;
6288 if (end-s < Py_UNICODE_SIZE) {
6289 endinpos = end-starts;
6290 reason = "truncated input";
6291 }
6292 else {
6293 endinpos = s - starts + Py_UNICODE_SIZE;
6294 reason = "illegal code point (> 0x10FFFF)";
6295 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006296 if (unicode_decode_call_errorhandler(
6297 errors, &errorHandler,
6298 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006299 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006300 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006301 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006302 continue;
6303 }
6304
6305 s += Py_UNICODE_SIZE;
6306#ifndef Py_UNICODE_WIDE
6307 if (ch >= 0xD800 && ch <= 0xDBFF && s < end)
6308 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006309 Py_UNICODE uch2;
6310 ((char *) &uch2)[0] = s[0];
6311 ((char *) &uch2)[1] = s[1];
6312 if (uch2 >= 0xDC00 && uch2 <= 0xDFFF)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006313 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006314 ch = (((uch & 0x3FF)<<10) | (uch2 & 0x3FF)) + 0x10000;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006315 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006316 }
6317 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006318#endif
6319
6320 if (unicode_putchar(&v, &outpos, ch) < 0)
6321 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006322 }
6323
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006324 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006325 goto onError;
6326 Py_XDECREF(errorHandler);
6327 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006328 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006329
Benjamin Peterson29060642009-01-31 22:14:21 +00006330 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006331 Py_XDECREF(v);
6332 Py_XDECREF(errorHandler);
6333 Py_XDECREF(exc);
6334 return NULL;
6335}
6336
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337/* --- Latin-1 Codec ------------------------------------------------------ */
6338
Alexander Belopolsky40018472011-02-26 01:02:56 +00006339PyObject *
6340PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006341 Py_ssize_t size,
6342 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006345 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346}
6347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006348/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006349static void
6350make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006351 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006352 PyObject *unicode,
6353 Py_ssize_t startpos, Py_ssize_t endpos,
6354 const char *reason)
6355{
6356 if (*exceptionObject == NULL) {
6357 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006358 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006359 encoding, unicode, startpos, endpos, reason);
6360 }
6361 else {
6362 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6363 goto onError;
6364 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6365 goto onError;
6366 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6367 goto onError;
6368 return;
6369 onError:
6370 Py_DECREF(*exceptionObject);
6371 *exceptionObject = NULL;
6372 }
6373}
6374
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006375/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006376static void
6377raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006378 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006379 PyObject *unicode,
6380 Py_ssize_t startpos, Py_ssize_t endpos,
6381 const char *reason)
6382{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006383 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006384 encoding, unicode, startpos, endpos, reason);
6385 if (*exceptionObject != NULL)
6386 PyCodec_StrictErrors(*exceptionObject);
6387}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006388
6389/* error handling callback helper:
6390 build arguments, call the callback and check the arguments,
6391 put the result into newpos and return the replacement string, which
6392 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006393static PyObject *
6394unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006395 PyObject **errorHandler,
6396 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006397 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006398 Py_ssize_t startpos, Py_ssize_t endpos,
6399 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006400{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006401 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006402 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006403 PyObject *restuple;
6404 PyObject *resunicode;
6405
6406 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006407 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006408 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006409 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006410 }
6411
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006412 if (PyUnicode_READY(unicode) < 0)
6413 return NULL;
6414 len = PyUnicode_GET_LENGTH(unicode);
6415
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006416 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006417 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006418 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006419 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006420
6421 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006422 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006424 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006425 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006426 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 Py_DECREF(restuple);
6428 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006429 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006430 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006431 &resunicode, newpos)) {
6432 Py_DECREF(restuple);
6433 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006434 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006435 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6436 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6437 Py_DECREF(restuple);
6438 return NULL;
6439 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006440 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006441 *newpos = len + *newpos;
6442 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6444 Py_DECREF(restuple);
6445 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006446 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006447 Py_INCREF(resunicode);
6448 Py_DECREF(restuple);
6449 return resunicode;
6450}
6451
Alexander Belopolsky40018472011-02-26 01:02:56 +00006452static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006453unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006454 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006455 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006456{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006457 /* input state */
6458 Py_ssize_t pos=0, size;
6459 int kind;
6460 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006461 /* output object */
6462 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006463 /* pointer into the output */
6464 char *str;
6465 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006466 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006467 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6468 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006469 PyObject *errorHandler = NULL;
6470 PyObject *exc = NULL;
6471 /* the following variable is used for caching string comparisons
6472 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6473 int known_errorHandler = -1;
6474
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006475 if (PyUnicode_READY(unicode) < 0)
6476 return NULL;
6477 size = PyUnicode_GET_LENGTH(unicode);
6478 kind = PyUnicode_KIND(unicode);
6479 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006480 /* allocate enough for a simple encoding without
6481 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006482 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006483 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006484 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006485 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006486 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006487 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006488 ressize = size;
6489
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006490 while (pos < size) {
6491 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006492
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 /* can we encode this? */
6494 if (c<limit) {
6495 /* no overflow check, because we know that the space is enough */
6496 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006497 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006498 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 Py_ssize_t requiredsize;
6501 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006502 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006504 Py_ssize_t collstart = pos;
6505 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006507 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 ++collend;
6509 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6510 if (known_errorHandler==-1) {
6511 if ((errors==NULL) || (!strcmp(errors, "strict")))
6512 known_errorHandler = 1;
6513 else if (!strcmp(errors, "replace"))
6514 known_errorHandler = 2;
6515 else if (!strcmp(errors, "ignore"))
6516 known_errorHandler = 3;
6517 else if (!strcmp(errors, "xmlcharrefreplace"))
6518 known_errorHandler = 4;
6519 else
6520 known_errorHandler = 0;
6521 }
6522 switch (known_errorHandler) {
6523 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006524 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006525 goto onError;
6526 case 2: /* replace */
6527 while (collstart++<collend)
6528 *str++ = '?'; /* fall through */
6529 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006530 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006531 break;
6532 case 4: /* xmlcharrefreplace */
6533 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006534 /* determine replacement size */
6535 for (i = collstart, repsize = 0; i < collend; ++i) {
6536 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6537 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006538 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006539 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006540 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006541 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006543 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006545#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 else
6547 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006548#else
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006549 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006551 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006552 repsize += 2+6+1;
6553 else
6554 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006555#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006557 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 if (requiredsize > ressize) {
6559 if (requiredsize<2*ressize)
6560 requiredsize = 2*ressize;
6561 if (_PyBytes_Resize(&res, requiredsize))
6562 goto onError;
6563 str = PyBytes_AS_STRING(res) + respos;
6564 ressize = requiredsize;
6565 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006566 /* generate replacement */
6567 for (i = collstart; i < collend; ++i) {
6568 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006569 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006570 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 break;
6572 default:
6573 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006574 encoding, reason, unicode, &exc,
6575 collstart, collend, &newpos);
6576 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6577 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006579 if (PyBytes_Check(repunicode)) {
6580 /* Directly copy bytes result to output. */
6581 repsize = PyBytes_Size(repunicode);
6582 if (repsize > 1) {
6583 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006584 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006585 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6586 Py_DECREF(repunicode);
6587 goto onError;
6588 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006589 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006590 ressize += repsize-1;
6591 }
6592 memcpy(str, PyBytes_AsString(repunicode), repsize);
6593 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006594 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006595 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006596 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006597 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 /* need more space? (at least enough for what we
6599 have+the replacement+the rest of the string, so
6600 we won't have to check space for encodable characters) */
6601 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006602 repsize = PyUnicode_GET_LENGTH(repunicode);
6603 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006604 if (requiredsize > ressize) {
6605 if (requiredsize<2*ressize)
6606 requiredsize = 2*ressize;
6607 if (_PyBytes_Resize(&res, requiredsize)) {
6608 Py_DECREF(repunicode);
6609 goto onError;
6610 }
6611 str = PyBytes_AS_STRING(res) + respos;
6612 ressize = requiredsize;
6613 }
6614 /* check if there is anything unencodable in the replacement
6615 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006616 for (i = 0; repsize-->0; ++i, ++str) {
6617 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006619 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006621 Py_DECREF(repunicode);
6622 goto onError;
6623 }
6624 *str = (char)c;
6625 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006626 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006627 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006628 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006629 }
6630 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006631 /* Resize if we allocated to much */
6632 size = str - PyBytes_AS_STRING(res);
6633 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006634 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006635 if (_PyBytes_Resize(&res, size) < 0)
6636 goto onError;
6637 }
6638
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006639 Py_XDECREF(errorHandler);
6640 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006641 return res;
6642
6643 onError:
6644 Py_XDECREF(res);
6645 Py_XDECREF(errorHandler);
6646 Py_XDECREF(exc);
6647 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006648}
6649
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006650/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006651PyObject *
6652PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006653 Py_ssize_t size,
6654 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006656 PyObject *result;
6657 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6658 if (unicode == NULL)
6659 return NULL;
6660 result = unicode_encode_ucs1(unicode, errors, 256);
6661 Py_DECREF(unicode);
6662 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663}
6664
Alexander Belopolsky40018472011-02-26 01:02:56 +00006665PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006666_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667{
6668 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006669 PyErr_BadArgument();
6670 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006672 if (PyUnicode_READY(unicode) == -1)
6673 return NULL;
6674 /* Fast path: if it is a one-byte string, construct
6675 bytes object directly. */
6676 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6677 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6678 PyUnicode_GET_LENGTH(unicode));
6679 /* Non-Latin-1 characters present. Defer to above function to
6680 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006681 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006682}
6683
6684PyObject*
6685PyUnicode_AsLatin1String(PyObject *unicode)
6686{
6687 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688}
6689
6690/* --- 7-bit ASCII Codec -------------------------------------------------- */
6691
Alexander Belopolsky40018472011-02-26 01:02:56 +00006692PyObject *
6693PyUnicode_DecodeASCII(const char *s,
6694 Py_ssize_t size,
6695 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006697 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006698 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006699 int kind;
6700 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006701 Py_ssize_t startinpos;
6702 Py_ssize_t endinpos;
6703 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006704 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006705 int has_error;
6706 const unsigned char *p = (const unsigned char *)s;
6707 const unsigned char *end = p + size;
6708 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006709 PyObject *errorHandler = NULL;
6710 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006711
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006712 if (size == 0) {
6713 Py_INCREF(unicode_empty);
6714 return unicode_empty;
6715 }
6716
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006718 if (size == 1 && (unsigned char)s[0] < 128)
6719 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006720
Victor Stinner702c7342011-10-05 13:50:52 +02006721 has_error = 0;
6722 while (p < end && !has_error) {
6723 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6724 an explanation. */
6725 if (!((size_t) p & LONG_PTR_MASK)) {
6726 /* Help register allocation */
6727 register const unsigned char *_p = p;
6728 while (_p < aligned_end) {
6729 unsigned long value = *(unsigned long *) _p;
6730 if (value & ASCII_CHAR_MASK) {
6731 has_error = 1;
6732 break;
6733 }
6734 _p += SIZEOF_LONG;
6735 }
6736 if (_p == end)
6737 break;
6738 if (has_error)
6739 break;
6740 p = _p;
6741 }
6742 if (*p & 0x80) {
6743 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006744 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006745 }
6746 else {
6747 ++p;
6748 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006749 }
Victor Stinner702c7342011-10-05 13:50:52 +02006750 if (!has_error)
6751 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006752
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006753 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006757 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006758 kind = PyUnicode_KIND(v);
6759 data = PyUnicode_DATA(v);
6760 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006761 e = s + size;
6762 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 register unsigned char c = (unsigned char)*s;
6764 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006765 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 ++s;
6767 }
6768 else {
6769 startinpos = s-starts;
6770 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 if (unicode_decode_call_errorhandler(
6772 errors, &errorHandler,
6773 "ascii", "ordinal not in range(128)",
6774 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006775 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006777 kind = PyUnicode_KIND(v);
6778 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006781 if (PyUnicode_Resize(&v, outpos) < 0)
6782 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006783 Py_XDECREF(errorHandler);
6784 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006785 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006786 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006787
Benjamin Peterson29060642009-01-31 22:14:21 +00006788 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006790 Py_XDECREF(errorHandler);
6791 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006792 return NULL;
6793}
6794
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006795/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006796PyObject *
6797PyUnicode_EncodeASCII(const Py_UNICODE *p,
6798 Py_ssize_t size,
6799 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006800{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006801 PyObject *result;
6802 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6803 if (unicode == NULL)
6804 return NULL;
6805 result = unicode_encode_ucs1(unicode, errors, 128);
6806 Py_DECREF(unicode);
6807 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006808}
6809
Alexander Belopolsky40018472011-02-26 01:02:56 +00006810PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006811_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812{
6813 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006814 PyErr_BadArgument();
6815 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006817 if (PyUnicode_READY(unicode) == -1)
6818 return NULL;
6819 /* Fast path: if it is an ASCII-only string, construct bytes object
6820 directly. Else defer to above function to raise the exception. */
6821 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6822 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6823 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006824 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006825}
6826
6827PyObject *
6828PyUnicode_AsASCIIString(PyObject *unicode)
6829{
6830 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006831}
6832
Victor Stinner99b95382011-07-04 14:23:54 +02006833#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006834
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006835/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006836
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006837#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006838#define NEED_RETRY
6839#endif
6840
Victor Stinner3a50e702011-10-18 21:21:00 +02006841#ifndef WC_ERR_INVALID_CHARS
6842# define WC_ERR_INVALID_CHARS 0x0080
6843#endif
6844
6845static char*
6846code_page_name(UINT code_page, PyObject **obj)
6847{
6848 *obj = NULL;
6849 if (code_page == CP_ACP)
6850 return "mbcs";
6851 if (code_page == CP_UTF7)
6852 return "CP_UTF7";
6853 if (code_page == CP_UTF8)
6854 return "CP_UTF8";
6855
6856 *obj = PyBytes_FromFormat("cp%u", code_page);
6857 if (*obj == NULL)
6858 return NULL;
6859 return PyBytes_AS_STRING(*obj);
6860}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006861
Alexander Belopolsky40018472011-02-26 01:02:56 +00006862static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006863is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006864{
6865 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006866 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006867
Victor Stinner3a50e702011-10-18 21:21:00 +02006868 if (!IsDBCSLeadByteEx(code_page, *curr))
6869 return 0;
6870
6871 prev = CharPrevExA(code_page, s, curr, 0);
6872 if (prev == curr)
6873 return 1;
6874 /* FIXME: This code is limited to "true" double-byte encodings,
6875 as it assumes an incomplete character consists of a single
6876 byte. */
6877 if (curr - prev == 2)
6878 return 1;
6879 if (!IsDBCSLeadByteEx(code_page, *prev))
6880 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006881 return 0;
6882}
6883
Victor Stinner3a50e702011-10-18 21:21:00 +02006884static DWORD
6885decode_code_page_flags(UINT code_page)
6886{
6887 if (code_page == CP_UTF7) {
6888 /* The CP_UTF7 decoder only supports flags=0 */
6889 return 0;
6890 }
6891 else
6892 return MB_ERR_INVALID_CHARS;
6893}
6894
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006895/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006896 * Decode a byte string from a Windows code page into unicode object in strict
6897 * mode.
6898 *
6899 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6900 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006901 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006902static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006903decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006904 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006905 const char *in,
6906 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006907{
Victor Stinner3a50e702011-10-18 21:21:00 +02006908 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006909 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006910 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006911
6912 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006913 assert(insize > 0);
6914 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6915 if (outsize <= 0)
6916 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006917
6918 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006919 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006920 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006921 if (*v == NULL)
6922 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006923 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006924 }
6925 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006927 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006928 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006930 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006931 }
6932
6933 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006934 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6935 if (outsize <= 0)
6936 goto error;
6937 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006938
Victor Stinner3a50e702011-10-18 21:21:00 +02006939error:
6940 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6941 return -2;
6942 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006943 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006944}
6945
Victor Stinner3a50e702011-10-18 21:21:00 +02006946/*
6947 * Decode a byte string from a code page into unicode object with an error
6948 * handler.
6949 *
6950 * Returns consumed size if succeed, or raise a WindowsError or
6951 * UnicodeDecodeError exception and returns -1 on error.
6952 */
6953static int
6954decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006955 PyObject **v,
6956 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006957 const char *errors)
6958{
6959 const char *startin = in;
6960 const char *endin = in + size;
6961 const DWORD flags = decode_code_page_flags(code_page);
6962 /* Ideally, we should get reason from FormatMessage. This is the Windows
6963 2000 English version of the message. */
6964 const char *reason = "No mapping for the Unicode character exists "
6965 "in the target code page.";
6966 /* each step cannot decode more than 1 character, but a character can be
6967 represented as a surrogate pair */
6968 wchar_t buffer[2], *startout, *out;
6969 int insize, outsize;
6970 PyObject *errorHandler = NULL;
6971 PyObject *exc = NULL;
6972 PyObject *encoding_obj = NULL;
6973 char *encoding;
6974 DWORD err;
6975 int ret = -1;
6976
6977 assert(size > 0);
6978
6979 encoding = code_page_name(code_page, &encoding_obj);
6980 if (encoding == NULL)
6981 return -1;
6982
6983 if (errors == NULL || strcmp(errors, "strict") == 0) {
6984 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6985 UnicodeDecodeError. */
6986 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6987 if (exc != NULL) {
6988 PyCodec_StrictErrors(exc);
6989 Py_CLEAR(exc);
6990 }
6991 goto error;
6992 }
6993
6994 if (*v == NULL) {
6995 /* Create unicode object */
6996 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6997 PyErr_NoMemory();
6998 goto error;
6999 }
Victor Stinner76a31a62011-11-04 00:05:13 +01007000 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007001 if (*v == NULL)
7002 goto error;
7003 startout = PyUnicode_AS_UNICODE(*v);
7004 }
7005 else {
7006 /* Extend unicode object */
7007 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7008 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7009 PyErr_NoMemory();
7010 goto error;
7011 }
Victor Stinner76a31a62011-11-04 00:05:13 +01007012 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007013 goto error;
7014 startout = PyUnicode_AS_UNICODE(*v) + n;
7015 }
7016
7017 /* Decode the byte string character per character */
7018 out = startout;
7019 while (in < endin)
7020 {
7021 /* Decode a character */
7022 insize = 1;
7023 do
7024 {
7025 outsize = MultiByteToWideChar(code_page, flags,
7026 in, insize,
7027 buffer, Py_ARRAY_LENGTH(buffer));
7028 if (outsize > 0)
7029 break;
7030 err = GetLastError();
7031 if (err != ERROR_NO_UNICODE_TRANSLATION
7032 && err != ERROR_INSUFFICIENT_BUFFER)
7033 {
7034 PyErr_SetFromWindowsErr(0);
7035 goto error;
7036 }
7037 insize++;
7038 }
7039 /* 4=maximum length of a UTF-8 sequence */
7040 while (insize <= 4 && (in + insize) <= endin);
7041
7042 if (outsize <= 0) {
7043 Py_ssize_t startinpos, endinpos, outpos;
7044
7045 startinpos = in - startin;
7046 endinpos = startinpos + 1;
7047 outpos = out - PyUnicode_AS_UNICODE(*v);
7048 if (unicode_decode_call_errorhandler(
7049 errors, &errorHandler,
7050 encoding, reason,
7051 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007052 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007053 {
7054 goto error;
7055 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007056 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007057 }
7058 else {
7059 in += insize;
7060 memcpy(out, buffer, outsize * sizeof(wchar_t));
7061 out += outsize;
7062 }
7063 }
7064
7065 /* write a NUL character at the end */
7066 *out = 0;
7067
7068 /* Extend unicode object */
7069 outsize = out - startout;
7070 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007071 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007072 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007073 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007074
7075error:
7076 Py_XDECREF(encoding_obj);
7077 Py_XDECREF(errorHandler);
7078 Py_XDECREF(exc);
7079 return ret;
7080}
7081
Victor Stinner3a50e702011-10-18 21:21:00 +02007082static PyObject *
7083decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007084 const char *s, Py_ssize_t size,
7085 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007086{
Victor Stinner76a31a62011-11-04 00:05:13 +01007087 PyObject *v = NULL;
7088 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007089
Victor Stinner3a50e702011-10-18 21:21:00 +02007090 if (code_page < 0) {
7091 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7092 return NULL;
7093 }
7094
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007095 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007096 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007097
Victor Stinner76a31a62011-11-04 00:05:13 +01007098 do
7099 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007100#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007101 if (size > INT_MAX) {
7102 chunk_size = INT_MAX;
7103 final = 0;
7104 done = 0;
7105 }
7106 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007107#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007108 {
7109 chunk_size = (int)size;
7110 final = (consumed == NULL);
7111 done = 1;
7112 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007113
Victor Stinner76a31a62011-11-04 00:05:13 +01007114 /* Skip trailing lead-byte unless 'final' is set */
7115 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7116 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117
Victor Stinner76a31a62011-11-04 00:05:13 +01007118 if (chunk_size == 0 && done) {
7119 if (v != NULL)
7120 break;
7121 Py_INCREF(unicode_empty);
7122 return unicode_empty;
7123 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007124
Victor Stinner76a31a62011-11-04 00:05:13 +01007125
7126 converted = decode_code_page_strict(code_page, &v,
7127 s, chunk_size);
7128 if (converted == -2)
7129 converted = decode_code_page_errors(code_page, &v,
7130 s, chunk_size,
7131 errors);
7132 assert(converted != 0);
7133
7134 if (converted < 0) {
7135 Py_XDECREF(v);
7136 return NULL;
7137 }
7138
7139 if (consumed)
7140 *consumed += converted;
7141
7142 s += converted;
7143 size -= converted;
7144 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007145
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007146 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007147}
7148
Alexander Belopolsky40018472011-02-26 01:02:56 +00007149PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007150PyUnicode_DecodeCodePageStateful(int code_page,
7151 const char *s,
7152 Py_ssize_t size,
7153 const char *errors,
7154 Py_ssize_t *consumed)
7155{
7156 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7157}
7158
7159PyObject *
7160PyUnicode_DecodeMBCSStateful(const char *s,
7161 Py_ssize_t size,
7162 const char *errors,
7163 Py_ssize_t *consumed)
7164{
7165 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7166}
7167
7168PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007169PyUnicode_DecodeMBCS(const char *s,
7170 Py_ssize_t size,
7171 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007172{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007173 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7174}
7175
Victor Stinner3a50e702011-10-18 21:21:00 +02007176static DWORD
7177encode_code_page_flags(UINT code_page, const char *errors)
7178{
7179 if (code_page == CP_UTF8) {
7180 if (winver.dwMajorVersion >= 6)
7181 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7182 and later */
7183 return WC_ERR_INVALID_CHARS;
7184 else
7185 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7186 return 0;
7187 }
7188 else if (code_page == CP_UTF7) {
7189 /* CP_UTF7 only supports flags=0 */
7190 return 0;
7191 }
7192 else {
7193 if (errors != NULL && strcmp(errors, "replace") == 0)
7194 return 0;
7195 else
7196 return WC_NO_BEST_FIT_CHARS;
7197 }
7198}
7199
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007200/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007201 * Encode a Unicode string to a Windows code page into a byte string in strict
7202 * mode.
7203 *
7204 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7205 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007206 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007207static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007208encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007209 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007210 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007211{
Victor Stinner554f3f02010-06-16 23:33:54 +00007212 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007213 BOOL *pusedDefaultChar = &usedDefaultChar;
7214 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007215 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007216 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007217 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007218 const DWORD flags = encode_code_page_flags(code_page, NULL);
7219 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007220 /* Create a substring so that we can get the UTF-16 representation
7221 of just the slice under consideration. */
7222 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007223
Martin v. Löwis3d325192011-11-04 18:23:06 +01007224 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007225
Victor Stinner3a50e702011-10-18 21:21:00 +02007226 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007227 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007228 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007229 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007230
Victor Stinner2fc507f2011-11-04 20:06:39 +01007231 substring = PyUnicode_Substring(unicode, offset, offset+len);
7232 if (substring == NULL)
7233 return -1;
7234 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7235 if (p == NULL) {
7236 Py_DECREF(substring);
7237 return -1;
7238 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007239
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007240 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007241 outsize = WideCharToMultiByte(code_page, flags,
7242 p, size,
7243 NULL, 0,
7244 NULL, pusedDefaultChar);
7245 if (outsize <= 0)
7246 goto error;
7247 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007248 if (pusedDefaultChar && *pusedDefaultChar) {
7249 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007250 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007251 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007252
Victor Stinner3a50e702011-10-18 21:21:00 +02007253 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007254 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007255 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007256 if (*outbytes == NULL) {
7257 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007258 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007259 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007260 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007261 }
7262 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007263 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007264 const Py_ssize_t n = PyBytes_Size(*outbytes);
7265 if (outsize > PY_SSIZE_T_MAX - n) {
7266 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007267 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007268 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007269 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007270 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7271 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007273 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007274 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007275 }
7276
7277 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007278 outsize = WideCharToMultiByte(code_page, flags,
7279 p, size,
7280 out, outsize,
7281 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007282 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007283 if (outsize <= 0)
7284 goto error;
7285 if (pusedDefaultChar && *pusedDefaultChar)
7286 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007287 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007288
Victor Stinner3a50e702011-10-18 21:21:00 +02007289error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007290 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007291 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7292 return -2;
7293 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007294 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007295}
7296
Victor Stinner3a50e702011-10-18 21:21:00 +02007297/*
7298 * Encode a Unicode string to a Windows code page into a byte string using a
7299 * error handler.
7300 *
7301 * Returns consumed characters if succeed, or raise a WindowsError and returns
7302 * -1 on other error.
7303 */
7304static int
7305encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007306 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007307 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007308{
Victor Stinner3a50e702011-10-18 21:21:00 +02007309 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007310 Py_ssize_t pos = unicode_offset;
7311 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 /* Ideally, we should get reason from FormatMessage. This is the Windows
7313 2000 English version of the message. */
7314 const char *reason = "invalid character";
7315 /* 4=maximum length of a UTF-8 sequence */
7316 char buffer[4];
7317 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7318 Py_ssize_t outsize;
7319 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007320 PyObject *errorHandler = NULL;
7321 PyObject *exc = NULL;
7322 PyObject *encoding_obj = NULL;
7323 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007324 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007325 PyObject *rep;
7326 int ret = -1;
7327
7328 assert(insize > 0);
7329
7330 encoding = code_page_name(code_page, &encoding_obj);
7331 if (encoding == NULL)
7332 return -1;
7333
7334 if (errors == NULL || strcmp(errors, "strict") == 0) {
7335 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7336 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007337 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 if (exc != NULL) {
7339 PyCodec_StrictErrors(exc);
7340 Py_DECREF(exc);
7341 }
7342 Py_XDECREF(encoding_obj);
7343 return -1;
7344 }
7345
7346 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7347 pusedDefaultChar = &usedDefaultChar;
7348 else
7349 pusedDefaultChar = NULL;
7350
7351 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7352 PyErr_NoMemory();
7353 goto error;
7354 }
7355 outsize = insize * Py_ARRAY_LENGTH(buffer);
7356
7357 if (*outbytes == NULL) {
7358 /* Create string object */
7359 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7360 if (*outbytes == NULL)
7361 goto error;
7362 out = PyBytes_AS_STRING(*outbytes);
7363 }
7364 else {
7365 /* Extend string object */
7366 Py_ssize_t n = PyBytes_Size(*outbytes);
7367 if (n > PY_SSIZE_T_MAX - outsize) {
7368 PyErr_NoMemory();
7369 goto error;
7370 }
7371 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7372 goto error;
7373 out = PyBytes_AS_STRING(*outbytes) + n;
7374 }
7375
7376 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007377 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007378 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007379 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7380 wchar_t chars[2];
7381 int charsize;
7382 if (ch < 0x10000) {
7383 chars[0] = (wchar_t)ch;
7384 charsize = 1;
7385 }
7386 else {
7387 ch -= 0x10000;
7388 chars[0] = 0xd800 + (ch >> 10);
7389 chars[1] = 0xdc00 + (ch & 0x3ff);
7390 charsize = 2;
7391 }
7392
Victor Stinner3a50e702011-10-18 21:21:00 +02007393 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007394 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007395 buffer, Py_ARRAY_LENGTH(buffer),
7396 NULL, pusedDefaultChar);
7397 if (outsize > 0) {
7398 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7399 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007400 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007401 memcpy(out, buffer, outsize);
7402 out += outsize;
7403 continue;
7404 }
7405 }
7406 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7407 PyErr_SetFromWindowsErr(0);
7408 goto error;
7409 }
7410
Victor Stinner3a50e702011-10-18 21:21:00 +02007411 rep = unicode_encode_call_errorhandler(
7412 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007413 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007414 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007415 if (rep == NULL)
7416 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007417 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007418
7419 if (PyBytes_Check(rep)) {
7420 outsize = PyBytes_GET_SIZE(rep);
7421 if (outsize != 1) {
7422 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7423 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7424 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7425 Py_DECREF(rep);
7426 goto error;
7427 }
7428 out = PyBytes_AS_STRING(*outbytes) + offset;
7429 }
7430 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7431 out += outsize;
7432 }
7433 else {
7434 Py_ssize_t i;
7435 enum PyUnicode_Kind kind;
7436 void *data;
7437
7438 if (PyUnicode_READY(rep) < 0) {
7439 Py_DECREF(rep);
7440 goto error;
7441 }
7442
7443 outsize = PyUnicode_GET_LENGTH(rep);
7444 if (outsize != 1) {
7445 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7446 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7447 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7448 Py_DECREF(rep);
7449 goto error;
7450 }
7451 out = PyBytes_AS_STRING(*outbytes) + offset;
7452 }
7453 kind = PyUnicode_KIND(rep);
7454 data = PyUnicode_DATA(rep);
7455 for (i=0; i < outsize; i++) {
7456 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7457 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007458 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007459 encoding, unicode,
7460 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007461 "unable to encode error handler result to ASCII");
7462 Py_DECREF(rep);
7463 goto error;
7464 }
7465 *out = (unsigned char)ch;
7466 out++;
7467 }
7468 }
7469 Py_DECREF(rep);
7470 }
7471 /* write a NUL byte */
7472 *out = 0;
7473 outsize = out - PyBytes_AS_STRING(*outbytes);
7474 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7475 if (_PyBytes_Resize(outbytes, outsize) < 0)
7476 goto error;
7477 ret = 0;
7478
7479error:
7480 Py_XDECREF(encoding_obj);
7481 Py_XDECREF(errorHandler);
7482 Py_XDECREF(exc);
7483 return ret;
7484}
7485
Victor Stinner3a50e702011-10-18 21:21:00 +02007486static PyObject *
7487encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007488 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007489 const char *errors)
7490{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007491 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007493 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007494 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007495
Victor Stinner2fc507f2011-11-04 20:06:39 +01007496 if (PyUnicode_READY(unicode) < 0)
7497 return NULL;
7498 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007499
Victor Stinner3a50e702011-10-18 21:21:00 +02007500 if (code_page < 0) {
7501 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7502 return NULL;
7503 }
7504
Martin v. Löwis3d325192011-11-04 18:23:06 +01007505 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007506 return PyBytes_FromStringAndSize(NULL, 0);
7507
Victor Stinner7581cef2011-11-03 22:32:33 +01007508 offset = 0;
7509 do
7510 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007511#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007512 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007513 chunks. */
7514 if (len > INT_MAX/2) {
7515 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007516 done = 0;
7517 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007518 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007519#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007520 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007521 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007522 done = 1;
7523 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007524
Victor Stinner76a31a62011-11-04 00:05:13 +01007525 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007526 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007527 errors);
7528 if (ret == -2)
7529 ret = encode_code_page_errors(code_page, &outbytes,
7530 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007531 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007532 if (ret < 0) {
7533 Py_XDECREF(outbytes);
7534 return NULL;
7535 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007536
Victor Stinner7581cef2011-11-03 22:32:33 +01007537 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007538 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007539 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007540
Victor Stinner3a50e702011-10-18 21:21:00 +02007541 return outbytes;
7542}
7543
7544PyObject *
7545PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7546 Py_ssize_t size,
7547 const char *errors)
7548{
Victor Stinner7581cef2011-11-03 22:32:33 +01007549 PyObject *unicode, *res;
7550 unicode = PyUnicode_FromUnicode(p, size);
7551 if (unicode == NULL)
7552 return NULL;
7553 res = encode_code_page(CP_ACP, unicode, errors);
7554 Py_DECREF(unicode);
7555 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007556}
7557
7558PyObject *
7559PyUnicode_EncodeCodePage(int code_page,
7560 PyObject *unicode,
7561 const char *errors)
7562{
Victor Stinner7581cef2011-11-03 22:32:33 +01007563 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007564}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007565
Alexander Belopolsky40018472011-02-26 01:02:56 +00007566PyObject *
7567PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007568{
7569 if (!PyUnicode_Check(unicode)) {
7570 PyErr_BadArgument();
7571 return NULL;
7572 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007573 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007574}
7575
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007576#undef NEED_RETRY
7577
Victor Stinner99b95382011-07-04 14:23:54 +02007578#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007579
Guido van Rossumd57fd912000-03-10 22:53:23 +00007580/* --- Character Mapping Codec -------------------------------------------- */
7581
Alexander Belopolsky40018472011-02-26 01:02:56 +00007582PyObject *
7583PyUnicode_DecodeCharmap(const char *s,
7584 Py_ssize_t size,
7585 PyObject *mapping,
7586 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007588 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007589 Py_ssize_t startinpos;
7590 Py_ssize_t endinpos;
7591 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007592 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007593 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007594 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007595 PyObject *errorHandler = NULL;
7596 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007597
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598 /* Default to Latin-1 */
7599 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007601
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007602 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007604 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007606 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007607 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007608 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007609 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007610 Py_ssize_t maplen;
7611 enum PyUnicode_Kind kind;
7612 void *data;
7613 Py_UCS4 x;
7614
7615 if (PyUnicode_READY(mapping) < 0)
7616 return NULL;
7617
7618 maplen = PyUnicode_GET_LENGTH(mapping);
7619 data = PyUnicode_DATA(mapping);
7620 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007621 while (s < e) {
7622 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007623
Benjamin Peterson29060642009-01-31 22:14:21 +00007624 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007625 x = PyUnicode_READ(kind, data, ch);
7626 else
7627 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007628
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007629 if (x == 0xfffe)
7630 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007631 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007632 startinpos = s-starts;
7633 endinpos = startinpos+1;
7634 if (unicode_decode_call_errorhandler(
7635 errors, &errorHandler,
7636 "charmap", "character maps to <undefined>",
7637 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007638 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007639 goto onError;
7640 }
7641 continue;
7642 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007643
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007644 if (unicode_putchar(&v, &outpos, x) < 0)
7645 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007647 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007648 }
7649 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007650 while (s < e) {
7651 unsigned char ch = *s;
7652 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007653
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7655 w = PyLong_FromLong((long)ch);
7656 if (w == NULL)
7657 goto onError;
7658 x = PyObject_GetItem(mapping, w);
7659 Py_DECREF(w);
7660 if (x == NULL) {
7661 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7662 /* No mapping found means: mapping is undefined. */
7663 PyErr_Clear();
7664 x = Py_None;
7665 Py_INCREF(x);
7666 } else
7667 goto onError;
7668 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007669
Benjamin Peterson29060642009-01-31 22:14:21 +00007670 /* Apply mapping */
7671 if (PyLong_Check(x)) {
7672 long value = PyLong_AS_LONG(x);
7673 if (value < 0 || value > 65535) {
7674 PyErr_SetString(PyExc_TypeError,
7675 "character mapping must be in range(65536)");
7676 Py_DECREF(x);
7677 goto onError;
7678 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007679 if (unicode_putchar(&v, &outpos, value) < 0)
7680 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007681 }
7682 else if (x == Py_None) {
7683 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 startinpos = s-starts;
7685 endinpos = startinpos+1;
7686 if (unicode_decode_call_errorhandler(
7687 errors, &errorHandler,
7688 "charmap", "character maps to <undefined>",
7689 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007690 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007691 Py_DECREF(x);
7692 goto onError;
7693 }
7694 Py_DECREF(x);
7695 continue;
7696 }
7697 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007698 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007699
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007700 if (PyUnicode_READY(x) < 0)
7701 goto onError;
7702 targetsize = PyUnicode_GET_LENGTH(x);
7703
7704 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007706 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007707 PyUnicode_READ_CHAR(x, 0)) < 0)
7708 goto onError;
7709 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 else if (targetsize > 1) {
7711 /* 1-n mapping */
7712 if (targetsize > extrachars) {
7713 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 Py_ssize_t needed = (targetsize - extrachars) + \
7715 (targetsize << 2);
7716 extrachars += needed;
7717 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007718 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007719 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007720 Py_DECREF(x);
7721 goto onError;
7722 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007723 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007724 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7725 goto onError;
7726 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7727 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007728 extrachars -= targetsize;
7729 }
7730 /* 1-0 mapping: skip the character */
7731 }
7732 else {
7733 /* wrong return value */
7734 PyErr_SetString(PyExc_TypeError,
7735 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007736 Py_DECREF(x);
7737 goto onError;
7738 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007739 Py_DECREF(x);
7740 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007741 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007743 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007744 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007745 Py_XDECREF(errorHandler);
7746 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007747 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007748
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007750 Py_XDECREF(errorHandler);
7751 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752 Py_XDECREF(v);
7753 return NULL;
7754}
7755
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007756/* Charmap encoding: the lookup table */
7757
Alexander Belopolsky40018472011-02-26 01:02:56 +00007758struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007759 PyObject_HEAD
7760 unsigned char level1[32];
7761 int count2, count3;
7762 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007763};
7764
7765static PyObject*
7766encoding_map_size(PyObject *obj, PyObject* args)
7767{
7768 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007769 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007771}
7772
7773static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007774 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007775 PyDoc_STR("Return the size (in bytes) of this object") },
7776 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007777};
7778
7779static void
7780encoding_map_dealloc(PyObject* o)
7781{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007782 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007783}
7784
7785static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007786 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 "EncodingMap", /*tp_name*/
7788 sizeof(struct encoding_map), /*tp_basicsize*/
7789 0, /*tp_itemsize*/
7790 /* methods */
7791 encoding_map_dealloc, /*tp_dealloc*/
7792 0, /*tp_print*/
7793 0, /*tp_getattr*/
7794 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007795 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007796 0, /*tp_repr*/
7797 0, /*tp_as_number*/
7798 0, /*tp_as_sequence*/
7799 0, /*tp_as_mapping*/
7800 0, /*tp_hash*/
7801 0, /*tp_call*/
7802 0, /*tp_str*/
7803 0, /*tp_getattro*/
7804 0, /*tp_setattro*/
7805 0, /*tp_as_buffer*/
7806 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7807 0, /*tp_doc*/
7808 0, /*tp_traverse*/
7809 0, /*tp_clear*/
7810 0, /*tp_richcompare*/
7811 0, /*tp_weaklistoffset*/
7812 0, /*tp_iter*/
7813 0, /*tp_iternext*/
7814 encoding_map_methods, /*tp_methods*/
7815 0, /*tp_members*/
7816 0, /*tp_getset*/
7817 0, /*tp_base*/
7818 0, /*tp_dict*/
7819 0, /*tp_descr_get*/
7820 0, /*tp_descr_set*/
7821 0, /*tp_dictoffset*/
7822 0, /*tp_init*/
7823 0, /*tp_alloc*/
7824 0, /*tp_new*/
7825 0, /*tp_free*/
7826 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007827};
7828
7829PyObject*
7830PyUnicode_BuildEncodingMap(PyObject* string)
7831{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007832 PyObject *result;
7833 struct encoding_map *mresult;
7834 int i;
7835 int need_dict = 0;
7836 unsigned char level1[32];
7837 unsigned char level2[512];
7838 unsigned char *mlevel1, *mlevel2, *mlevel3;
7839 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007840 int kind;
7841 void *data;
7842 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007844 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007845 PyErr_BadArgument();
7846 return NULL;
7847 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007848 kind = PyUnicode_KIND(string);
7849 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007850 memset(level1, 0xFF, sizeof level1);
7851 memset(level2, 0xFF, sizeof level2);
7852
7853 /* If there isn't a one-to-one mapping of NULL to \0,
7854 or if there are non-BMP characters, we need to use
7855 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007856 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007857 need_dict = 1;
7858 for (i = 1; i < 256; i++) {
7859 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007860 ch = PyUnicode_READ(kind, data, i);
7861 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007862 need_dict = 1;
7863 break;
7864 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007865 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007866 /* unmapped character */
7867 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007868 l1 = ch >> 11;
7869 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007870 if (level1[l1] == 0xFF)
7871 level1[l1] = count2++;
7872 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007873 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007874 }
7875
7876 if (count2 >= 0xFF || count3 >= 0xFF)
7877 need_dict = 1;
7878
7879 if (need_dict) {
7880 PyObject *result = PyDict_New();
7881 PyObject *key, *value;
7882 if (!result)
7883 return NULL;
7884 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007885 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007886 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007887 if (!key || !value)
7888 goto failed1;
7889 if (PyDict_SetItem(result, key, value) == -1)
7890 goto failed1;
7891 Py_DECREF(key);
7892 Py_DECREF(value);
7893 }
7894 return result;
7895 failed1:
7896 Py_XDECREF(key);
7897 Py_XDECREF(value);
7898 Py_DECREF(result);
7899 return NULL;
7900 }
7901
7902 /* Create a three-level trie */
7903 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7904 16*count2 + 128*count3 - 1);
7905 if (!result)
7906 return PyErr_NoMemory();
7907 PyObject_Init(result, &EncodingMapType);
7908 mresult = (struct encoding_map*)result;
7909 mresult->count2 = count2;
7910 mresult->count3 = count3;
7911 mlevel1 = mresult->level1;
7912 mlevel2 = mresult->level23;
7913 mlevel3 = mresult->level23 + 16*count2;
7914 memcpy(mlevel1, level1, 32);
7915 memset(mlevel2, 0xFF, 16*count2);
7916 memset(mlevel3, 0, 128*count3);
7917 count3 = 0;
7918 for (i = 1; i < 256; i++) {
7919 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007920 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007921 /* unmapped character */
7922 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007923 o1 = PyUnicode_READ(kind, data, i)>>11;
7924 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007925 i2 = 16*mlevel1[o1] + o2;
7926 if (mlevel2[i2] == 0xFF)
7927 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007928 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007929 i3 = 128*mlevel2[i2] + o3;
7930 mlevel3[i3] = i;
7931 }
7932 return result;
7933}
7934
7935static int
Victor Stinner22168992011-11-20 17:09:18 +01007936encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007937{
7938 struct encoding_map *map = (struct encoding_map*)mapping;
7939 int l1 = c>>11;
7940 int l2 = (c>>7) & 0xF;
7941 int l3 = c & 0x7F;
7942 int i;
7943
Victor Stinner22168992011-11-20 17:09:18 +01007944 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007946 if (c == 0)
7947 return 0;
7948 /* level 1*/
7949 i = map->level1[l1];
7950 if (i == 0xFF) {
7951 return -1;
7952 }
7953 /* level 2*/
7954 i = map->level23[16*i+l2];
7955 if (i == 0xFF) {
7956 return -1;
7957 }
7958 /* level 3 */
7959 i = map->level23[16*map->count2 + 128*i + l3];
7960 if (i == 0) {
7961 return -1;
7962 }
7963 return i;
7964}
7965
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007966/* Lookup the character ch in the mapping. If the character
7967 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007968 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007969static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007970charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971{
Christian Heimes217cfd12007-12-02 14:31:20 +00007972 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007973 PyObject *x;
7974
7975 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007977 x = PyObject_GetItem(mapping, w);
7978 Py_DECREF(w);
7979 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7981 /* No mapping found means: mapping is undefined. */
7982 PyErr_Clear();
7983 x = Py_None;
7984 Py_INCREF(x);
7985 return x;
7986 } else
7987 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007989 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007991 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 long value = PyLong_AS_LONG(x);
7993 if (value < 0 || value > 255) {
7994 PyErr_SetString(PyExc_TypeError,
7995 "character mapping must be in range(256)");
7996 Py_DECREF(x);
7997 return NULL;
7998 }
7999 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008000 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008001 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008003 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008004 /* wrong return value */
8005 PyErr_Format(PyExc_TypeError,
8006 "character mapping must return integer, bytes or None, not %.400s",
8007 x->ob_type->tp_name);
8008 Py_DECREF(x);
8009 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010 }
8011}
8012
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008013static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008014charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008015{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008016 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8017 /* exponentially overallocate to minimize reallocations */
8018 if (requiredsize < 2*outsize)
8019 requiredsize = 2*outsize;
8020 if (_PyBytes_Resize(outobj, requiredsize))
8021 return -1;
8022 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008023}
8024
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* lookup or output resizing failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008028/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008029 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008030 space is available. Return a new reference to the object that
8031 was put in the output buffer, or Py_None, if the mapping was undefined
8032 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008033 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008034static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008035charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008036 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008037{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008038 PyObject *rep;
8039 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008040 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008041
Christian Heimes90aa7642007-12-19 02:45:37 +00008042 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008043 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008045 if (res == -1)
8046 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 if (outsize<requiredsize)
8048 if (charmapencode_resize(outobj, outpos, requiredsize))
8049 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008050 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 outstart[(*outpos)++] = (char)res;
8052 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053 }
8054
8055 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008056 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008058 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 Py_DECREF(rep);
8060 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008061 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 if (PyLong_Check(rep)) {
8063 Py_ssize_t requiredsize = *outpos+1;
8064 if (outsize<requiredsize)
8065 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8066 Py_DECREF(rep);
8067 return enc_EXCEPTION;
8068 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008069 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008071 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 else {
8073 const char *repchars = PyBytes_AS_STRING(rep);
8074 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8075 Py_ssize_t requiredsize = *outpos+repsize;
8076 if (outsize<requiredsize)
8077 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8078 Py_DECREF(rep);
8079 return enc_EXCEPTION;
8080 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008081 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008082 memcpy(outstart + *outpos, repchars, repsize);
8083 *outpos += repsize;
8084 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008085 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008086 Py_DECREF(rep);
8087 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088}
8089
8090/* handle an error in PyUnicode_EncodeCharmap
8091 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008092static int
8093charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008094 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008095 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008096 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008097 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008098{
8099 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008100 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008101 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008102 enum PyUnicode_Kind kind;
8103 void *data;
8104 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008105 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008106 Py_ssize_t collstartpos = *inpos;
8107 Py_ssize_t collendpos = *inpos+1;
8108 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008109 char *encoding = "charmap";
8110 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008111 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008112 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008113 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008114
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008115 if (PyUnicode_READY(unicode) < 0)
8116 return -1;
8117 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008118 /* find all unencodable characters */
8119 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008120 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008121 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008122 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008123 val = encoding_map_lookup(ch, mapping);
8124 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 break;
8126 ++collendpos;
8127 continue;
8128 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008129
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008130 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8131 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008132 if (rep==NULL)
8133 return -1;
8134 else if (rep!=Py_None) {
8135 Py_DECREF(rep);
8136 break;
8137 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008138 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 }
8141 /* cache callback name lookup
8142 * (if not done yet, i.e. it's the first error) */
8143 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008144 if ((errors==NULL) || (!strcmp(errors, "strict")))
8145 *known_errorHandler = 1;
8146 else if (!strcmp(errors, "replace"))
8147 *known_errorHandler = 2;
8148 else if (!strcmp(errors, "ignore"))
8149 *known_errorHandler = 3;
8150 else if (!strcmp(errors, "xmlcharrefreplace"))
8151 *known_errorHandler = 4;
8152 else
8153 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008154 }
8155 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008156 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008157 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008158 return -1;
8159 case 2: /* replace */
8160 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 x = charmapencode_output('?', mapping, res, respos);
8162 if (x==enc_EXCEPTION) {
8163 return -1;
8164 }
8165 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008166 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 return -1;
8168 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008169 }
8170 /* fall through */
8171 case 3: /* ignore */
8172 *inpos = collendpos;
8173 break;
8174 case 4: /* xmlcharrefreplace */
8175 /* generate replacement (temporarily (mis)uses p) */
8176 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 char buffer[2+29+1+1];
8178 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008179 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 for (cp = buffer; *cp; ++cp) {
8181 x = charmapencode_output(*cp, mapping, res, respos);
8182 if (x==enc_EXCEPTION)
8183 return -1;
8184 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008185 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 return -1;
8187 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008188 }
8189 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008190 *inpos = collendpos;
8191 break;
8192 default:
8193 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008194 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008195 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008196 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008198 if (PyBytes_Check(repunicode)) {
8199 /* Directly copy bytes result to output. */
8200 Py_ssize_t outsize = PyBytes_Size(*res);
8201 Py_ssize_t requiredsize;
8202 repsize = PyBytes_Size(repunicode);
8203 requiredsize = *respos + repsize;
8204 if (requiredsize > outsize)
8205 /* Make room for all additional bytes. */
8206 if (charmapencode_resize(res, respos, requiredsize)) {
8207 Py_DECREF(repunicode);
8208 return -1;
8209 }
8210 memcpy(PyBytes_AsString(*res) + *respos,
8211 PyBytes_AsString(repunicode), repsize);
8212 *respos += repsize;
8213 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008214 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008215 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008216 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008217 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008218 if (PyUnicode_READY(repunicode) < 0) {
8219 Py_DECREF(repunicode);
8220 return -1;
8221 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008222 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008223 data = PyUnicode_DATA(repunicode);
8224 kind = PyUnicode_KIND(repunicode);
8225 for (index = 0; index < repsize; index++) {
8226 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8227 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008229 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 return -1;
8231 }
8232 else if (x==enc_FAILED) {
8233 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008234 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 return -1;
8236 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008237 }
8238 *inpos = newpos;
8239 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240 }
8241 return 0;
8242}
8243
Alexander Belopolsky40018472011-02-26 01:02:56 +00008244PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008245_PyUnicode_EncodeCharmap(PyObject *unicode,
8246 PyObject *mapping,
8247 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008248{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249 /* output object */
8250 PyObject *res = NULL;
8251 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008252 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008253 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008254 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008255 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008256 PyObject *errorHandler = NULL;
8257 PyObject *exc = NULL;
8258 /* the following variable is used for caching string comparisons
8259 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8260 * 3=ignore, 4=xmlcharrefreplace */
8261 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008262
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008263 if (PyUnicode_READY(unicode) < 0)
8264 return NULL;
8265 size = PyUnicode_GET_LENGTH(unicode);
8266
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267 /* Default to Latin-1 */
8268 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008269 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008270
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008271 /* allocate enough for a simple encoding without
8272 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008273 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008274 if (res == NULL)
8275 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008276 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008279 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008280 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008282 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 if (x==enc_EXCEPTION) /* error */
8284 goto onError;
8285 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008286 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 &exc,
8288 &known_errorHandler, &errorHandler, errors,
8289 &res, &respos)) {
8290 goto onError;
8291 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008292 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008293 else
8294 /* done with this character => adjust input position */
8295 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008298 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008299 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008300 if (_PyBytes_Resize(&res, respos) < 0)
8301 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008302
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008303 Py_XDECREF(exc);
8304 Py_XDECREF(errorHandler);
8305 return res;
8306
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008308 Py_XDECREF(res);
8309 Py_XDECREF(exc);
8310 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311 return NULL;
8312}
8313
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008314/* Deprecated */
8315PyObject *
8316PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8317 Py_ssize_t size,
8318 PyObject *mapping,
8319 const char *errors)
8320{
8321 PyObject *result;
8322 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8323 if (unicode == NULL)
8324 return NULL;
8325 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8326 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008327 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008328}
8329
Alexander Belopolsky40018472011-02-26 01:02:56 +00008330PyObject *
8331PyUnicode_AsCharmapString(PyObject *unicode,
8332 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333{
8334 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008335 PyErr_BadArgument();
8336 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008338 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008339}
8340
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008341/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008342static void
8343make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008344 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008345 Py_ssize_t startpos, Py_ssize_t endpos,
8346 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008348 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008349 *exceptionObject = _PyUnicodeTranslateError_Create(
8350 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351 }
8352 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8354 goto onError;
8355 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8356 goto onError;
8357 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8358 goto onError;
8359 return;
8360 onError:
8361 Py_DECREF(*exceptionObject);
8362 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 }
8364}
8365
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008367static void
8368raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008369 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008370 Py_ssize_t startpos, Py_ssize_t endpos,
8371 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372{
8373 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008375 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377}
8378
8379/* error handling callback helper:
8380 build arguments, call the callback and check the arguments,
8381 put the result into newpos and return the replacement string, which
8382 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008383static PyObject *
8384unicode_translate_call_errorhandler(const char *errors,
8385 PyObject **errorHandler,
8386 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008388 Py_ssize_t startpos, Py_ssize_t endpos,
8389 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008391 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008393 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394 PyObject *restuple;
8395 PyObject *resunicode;
8396
8397 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008399 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401 }
8402
8403 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008404 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008405 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008406 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407
8408 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008410 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008413 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 Py_DECREF(restuple);
8415 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008416 }
8417 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 &resunicode, &i_newpos)) {
8419 Py_DECREF(restuple);
8420 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008422 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008423 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008424 else
8425 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008426 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008427 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8428 Py_DECREF(restuple);
8429 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008430 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008431 Py_INCREF(resunicode);
8432 Py_DECREF(restuple);
8433 return resunicode;
8434}
8435
8436/* Lookup the character ch in the mapping and put the result in result,
8437 which must be decrefed by the caller.
8438 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008439static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008440charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008441{
Christian Heimes217cfd12007-12-02 14:31:20 +00008442 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008443 PyObject *x;
8444
8445 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008447 x = PyObject_GetItem(mapping, w);
8448 Py_DECREF(w);
8449 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8451 /* No mapping found means: use 1:1 mapping. */
8452 PyErr_Clear();
8453 *result = NULL;
8454 return 0;
8455 } else
8456 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008457 }
8458 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 *result = x;
8460 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008461 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008462 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 long value = PyLong_AS_LONG(x);
8464 long max = PyUnicode_GetMax();
8465 if (value < 0 || value > max) {
8466 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008467 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 Py_DECREF(x);
8469 return -1;
8470 }
8471 *result = x;
8472 return 0;
8473 }
8474 else if (PyUnicode_Check(x)) {
8475 *result = x;
8476 return 0;
8477 }
8478 else {
8479 /* wrong return value */
8480 PyErr_SetString(PyExc_TypeError,
8481 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008482 Py_DECREF(x);
8483 return -1;
8484 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008485}
8486/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 if not reallocate and adjust various state variables.
8488 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008489static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008490charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008492{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008493 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008494 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 /* exponentially overallocate to minimize reallocations */
8496 if (requiredsize < 2 * oldsize)
8497 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008498 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8499 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008502 }
8503 return 0;
8504}
8505/* lookup the character, put the result in the output string and adjust
8506 various state variables. Return a new reference to the object that
8507 was put in the output buffer in *result, or Py_None, if the mapping was
8508 undefined (in which case no character was written).
   The caller must decref result.
8510 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008511static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008512charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8513 PyObject *mapping, Py_UCS4 **output,
8514 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008515 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008516{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8518 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008522 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008523 }
8524 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008525 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008526 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008528 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008529 }
8530 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 Py_ssize_t repsize;
8532 if (PyUnicode_READY(*res) == -1)
8533 return -1;
8534 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 if (repsize==1) {
8536 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008537 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008538 }
8539 else if (repsize!=0) {
8540 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 Py_ssize_t requiredsize = *opos +
8542 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008543 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008544 Py_ssize_t i;
8545 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008546 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008547 for(i = 0; i < repsize; i++)
8548 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008550 }
8551 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008553 return 0;
8554}
8555
Alexander Belopolsky40018472011-02-26 01:02:56 +00008556PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557_PyUnicode_TranslateCharmap(PyObject *input,
8558 PyObject *mapping,
8559 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008561 /* input object */
8562 char *idata;
8563 Py_ssize_t size, i;
8564 int kind;
8565 /* output buffer */
8566 Py_UCS4 *output = NULL;
8567 Py_ssize_t osize;
8568 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008569 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008571 char *reason = "character maps to <undefined>";
8572 PyObject *errorHandler = NULL;
8573 PyObject *exc = NULL;
8574 /* the following variable is used for caching string comparisons
8575 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8576 * 3=ignore, 4=xmlcharrefreplace */
8577 int known_errorHandler = -1;
8578
Guido van Rossumd57fd912000-03-10 22:53:23 +00008579 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 PyErr_BadArgument();
8581 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008584 if (PyUnicode_READY(input) == -1)
8585 return NULL;
8586 idata = (char*)PyUnicode_DATA(input);
8587 kind = PyUnicode_KIND(input);
8588 size = PyUnicode_GET_LENGTH(input);
8589 i = 0;
8590
8591 if (size == 0) {
8592 Py_INCREF(input);
8593 return input;
8594 }
8595
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008596 /* allocate enough for a simple 1:1 translation without
8597 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008598 osize = size;
8599 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8600 opos = 0;
8601 if (output == NULL) {
8602 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008604 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 /* try to encode it */
8608 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609 if (charmaptranslate_output(input, i, mapping,
8610 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 Py_XDECREF(x);
8612 goto onError;
8613 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008614 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008616 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008617 else { /* untranslatable character */
8618 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8619 Py_ssize_t repsize;
8620 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 Py_ssize_t collstart = i;
8624 Py_ssize_t collend = i+1;
8625 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628 while (collend < size) {
8629 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 goto onError;
8631 Py_XDECREF(x);
8632 if (x!=Py_None)
8633 break;
8634 ++collend;
8635 }
8636 /* cache callback name lookup
8637 * (if not done yet, i.e. it's the first error) */
8638 if (known_errorHandler==-1) {
8639 if ((errors==NULL) || (!strcmp(errors, "strict")))
8640 known_errorHandler = 1;
8641 else if (!strcmp(errors, "replace"))
8642 known_errorHandler = 2;
8643 else if (!strcmp(errors, "ignore"))
8644 known_errorHandler = 3;
8645 else if (!strcmp(errors, "xmlcharrefreplace"))
8646 known_errorHandler = 4;
8647 else
8648 known_errorHandler = 0;
8649 }
8650 switch (known_errorHandler) {
8651 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652 raise_translate_exception(&exc, input, collstart,
8653 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008654 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 case 2: /* replace */
8656 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 for (coll = collstart; coll<collend; coll++)
8658 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 /* fall through */
8660 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008661 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 break;
8663 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 /* generate replacement (temporarily (mis)uses i) */
8665 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 char buffer[2+29+1+1];
8667 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008668 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8669 if (charmaptranslate_makespace(&output, &osize,
8670 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 goto onError;
8672 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008673 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008675 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 break;
8677 default:
8678 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 reason, input, &exc,
8680 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008681 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008683 if (PyUnicode_READY(repunicode) < 0) {
8684 Py_DECREF(repunicode);
8685 goto onError;
8686 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008688 repsize = PyUnicode_GET_LENGTH(repunicode);
8689 if (charmaptranslate_makespace(&output, &osize,
8690 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 Py_DECREF(repunicode);
8692 goto onError;
8693 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008694 for (uni2 = 0; repsize-->0; ++uni2)
8695 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8696 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008697 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008698 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008699 }
8700 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8702 if (!res)
8703 goto onError;
8704 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008705 Py_XDECREF(exc);
8706 Py_XDECREF(errorHandler);
8707 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711 Py_XDECREF(exc);
8712 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008713 return NULL;
8714}
8715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008716/* Deprecated. Use PyUnicode_Translate instead. */
8717PyObject *
8718PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8719 Py_ssize_t size,
8720 PyObject *mapping,
8721 const char *errors)
8722{
8723 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8724 if (!unicode)
8725 return NULL;
8726 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8727}
8728
Alexander Belopolsky40018472011-02-26 01:02:56 +00008729PyObject *
8730PyUnicode_Translate(PyObject *str,
8731 PyObject *mapping,
8732 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733{
8734 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008735
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736 str = PyUnicode_FromObject(str);
8737 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740 Py_DECREF(str);
8741 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008742
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744 Py_XDECREF(str);
8745 return NULL;
8746}
Tim Petersced69f82003-09-16 20:30:58 +00008747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008748static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008749fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008750{
8751 /* No need to call PyUnicode_READY(self) because this function is only
8752 called as a callback from fixup() which does it already. */
8753 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8754 const int kind = PyUnicode_KIND(self);
8755 void *data = PyUnicode_DATA(self);
8756 Py_UCS4 maxchar = 0, ch, fixed;
8757 Py_ssize_t i;
8758
8759 for (i = 0; i < len; ++i) {
8760 ch = PyUnicode_READ(kind, data, i);
8761 fixed = 0;
8762 if (ch > 127) {
8763 if (Py_UNICODE_ISSPACE(ch))
8764 fixed = ' ';
8765 else {
8766 const int decimal = Py_UNICODE_TODECIMAL(ch);
8767 if (decimal >= 0)
8768 fixed = '0' + decimal;
8769 }
8770 if (fixed != 0) {
8771 if (fixed > maxchar)
8772 maxchar = fixed;
8773 PyUnicode_WRITE(kind, data, i, fixed);
8774 }
8775 else if (ch > maxchar)
8776 maxchar = ch;
8777 }
8778 else if (ch > maxchar)
8779 maxchar = ch;
8780 }
8781
8782 return maxchar;
8783}
8784
8785PyObject *
8786_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8787{
8788 if (!PyUnicode_Check(unicode)) {
8789 PyErr_BadInternalCall();
8790 return NULL;
8791 }
8792 if (PyUnicode_READY(unicode) == -1)
8793 return NULL;
8794 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8795 /* If the string is already ASCII, just return the same string */
8796 Py_INCREF(unicode);
8797 return unicode;
8798 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008799 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800}
8801
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008802PyObject *
8803PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8804 Py_ssize_t length)
8805{
Victor Stinnerf0124502011-11-21 23:12:56 +01008806 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008807 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008808 Py_UCS4 maxchar;
8809 enum PyUnicode_Kind kind;
8810 void *data;
8811
8812 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008813 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008814 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008815 if (ch > 127) {
8816 int decimal = Py_UNICODE_TODECIMAL(ch);
8817 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008818 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008819 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008820 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008821 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008822
8823 /* Copy to a new string */
8824 decimal = PyUnicode_New(length, maxchar);
8825 if (decimal == NULL)
8826 return decimal;
8827 kind = PyUnicode_KIND(decimal);
8828 data = PyUnicode_DATA(decimal);
8829 /* Iterate over code points */
8830 for (i = 0; i < length; i++) {
8831 Py_UNICODE ch = s[i];
8832 if (ch > 127) {
8833 int decimal = Py_UNICODE_TODECIMAL(ch);
8834 if (decimal >= 0)
8835 ch = '0' + decimal;
8836 }
8837 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008838 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008839 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008840}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008841/* --- Decimal Encoder ---------------------------------------------------- */
8842
Alexander Belopolsky40018472011-02-26 01:02:56 +00008843int
8844PyUnicode_EncodeDecimal(Py_UNICODE *s,
8845 Py_ssize_t length,
8846 char *output,
8847 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008848{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008849 PyObject *errorHandler = NULL;
8850 PyObject *exc = NULL;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008851 PyObject *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008852 const char *encoding = "decimal";
8853 const char *reason = "invalid decimal Unicode string";
8854 /* the following variable is used for caching string comparisons
8855 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8856 int known_errorHandler = -1;
Victor Stinner42bf7752011-11-21 22:52:58 +01008857 Py_ssize_t i, j;
8858 enum PyUnicode_Kind kind;
8859 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008860
8861 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008862 PyErr_BadArgument();
8863 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008864 }
8865
Victor Stinner42bf7752011-11-21 22:52:58 +01008866 unicode = PyUnicode_FromUnicode(s, length);
8867 if (unicode == NULL)
8868 return -1;
8869
8870 if (PyUnicode_READY(unicode) < 0)
8871 goto onError;
8872 kind = PyUnicode_KIND(unicode);
8873 data = PyUnicode_DATA(unicode);
8874
8875 for (i=0; i < length; i++) {
8876 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008877 int decimal;
Victor Stinner42bf7752011-11-21 22:52:58 +01008878 Py_ssize_t startpos, endpos;
Tim Petersced69f82003-09-16 20:30:58 +00008879
Benjamin Peterson29060642009-01-31 22:14:21 +00008880 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008881 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008883 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 decimal = Py_UNICODE_TODECIMAL(ch);
8885 if (decimal >= 0) {
8886 *output++ = '0' + decimal;
Benjamin Peterson29060642009-01-31 22:14:21 +00008887 continue;
8888 }
8889 if (0 < ch && ch < 256) {
8890 *output++ = (char)ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008891 continue;
8892 }
8893 /* All other characters are considered unencodable */
Victor Stinner42bf7752011-11-21 22:52:58 +01008894 startpos = i;
8895 endpos = i+1;
8896 for (; endpos < length; endpos++) {
8897 ch = PyUnicode_READ(kind, data, endpos);
8898 if ((0 < ch && ch < 256) ||
8899 !Py_UNICODE_ISSPACE(ch) ||
8900 Py_UNICODE_TODECIMAL(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 break;
8902 }
8903 /* cache callback name lookup
8904 * (if not done yet, i.e. it's the first error) */
8905 if (known_errorHandler==-1) {
8906 if ((errors==NULL) || (!strcmp(errors, "strict")))
8907 known_errorHandler = 1;
8908 else if (!strcmp(errors, "replace"))
8909 known_errorHandler = 2;
8910 else if (!strcmp(errors, "ignore"))
8911 known_errorHandler = 3;
8912 else if (!strcmp(errors, "xmlcharrefreplace"))
8913 known_errorHandler = 4;
8914 else
8915 known_errorHandler = 0;
8916 }
8917 switch (known_errorHandler) {
8918 case 1: /* strict */
Victor Stinner42bf7752011-11-21 22:52:58 +01008919 raise_encode_exception(&exc, encoding, unicode, startpos, endpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008920 goto onError;
8921 case 2: /* replace */
Victor Stinner42bf7752011-11-21 22:52:58 +01008922 for (j=startpos; j < endpos; j++)
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 *output++ = '?';
8924 /* fall through */
8925 case 3: /* ignore */
Victor Stinner42bf7752011-11-21 22:52:58 +01008926 i = endpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008927 break;
8928 case 4: /* xmlcharrefreplace */
Victor Stinner42bf7752011-11-21 22:52:58 +01008929 /* generate replacement */
8930 for (j=startpos; j < endpos; j++) {
8931 ch = PyUnicode_READ(kind, data, i);
8932 output += sprintf(output, "&#%d;", (int)ch);
8933 i++;
8934 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 break;
8936 default:
Victor Stinner42bf7752011-11-21 22:52:58 +01008937 {
8938 PyObject *repunicode;
8939 Py_ssize_t repsize, newpos, k;
8940 enum PyUnicode_Kind repkind;
8941 void *repdata;
8942
Benjamin Peterson29060642009-01-31 22:14:21 +00008943 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008944 encoding, reason, unicode, &exc,
Victor Stinner42bf7752011-11-21 22:52:58 +01008945 startpos, endpos, &newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 if (repunicode == NULL)
8947 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008948 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008949 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008950 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8951 Py_DECREF(repunicode);
8952 goto onError;
8953 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008954 if (PyUnicode_READY(repunicode) < 0) {
8955 Py_DECREF(repunicode);
8956 goto onError;
8957 }
8958 repkind = PyUnicode_KIND(repunicode);
8959 repdata = PyUnicode_DATA(repunicode);
8960
Benjamin Peterson29060642009-01-31 22:14:21 +00008961 /* generate replacement */
8962 repsize = PyUnicode_GET_SIZE(repunicode);
Victor Stinner42bf7752011-11-21 22:52:58 +01008963 for (k=0; k<repsize; k++) {
8964 ch = PyUnicode_READ(repkind, repdata, k);
Benjamin Peterson29060642009-01-31 22:14:21 +00008965 if (Py_UNICODE_ISSPACE(ch))
8966 *output++ = ' ';
8967 else {
8968 decimal = Py_UNICODE_TODECIMAL(ch);
8969 if (decimal >= 0)
8970 *output++ = '0' + decimal;
8971 else if (0 < ch && ch < 256)
8972 *output++ = (char)ch;
8973 else {
8974 Py_DECREF(repunicode);
8975 raise_encode_exception(&exc, encoding,
Victor Stinner42bf7752011-11-21 22:52:58 +01008976 unicode, startpos, endpos,
8977 reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008978 goto onError;
8979 }
8980 }
8981 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008982 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008983 Py_DECREF(repunicode);
8984 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008985 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008986 }
8987 /* 0-terminate the output string */
8988 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008989 Py_XDECREF(exc);
8990 Py_XDECREF(errorHandler);
Victor Stinner42bf7752011-11-21 22:52:58 +01008991 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008992 return 0;
8993
Benjamin Peterson29060642009-01-31 22:14:21 +00008994 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008995 Py_XDECREF(exc);
8996 Py_XDECREF(errorHandler);
Victor Stinner42bf7752011-11-21 22:52:58 +01008997 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008998 return -1;
8999}
9000
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001/* --- Helpers ------------------------------------------------------------ */
9002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009004any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009005 Py_ssize_t start,
9006 Py_ssize_t end)
9007{
9008 int kind1, kind2, kind;
9009 void *buf1, *buf2;
9010 Py_ssize_t len1, len2, result;
9011
9012 kind1 = PyUnicode_KIND(s1);
9013 kind2 = PyUnicode_KIND(s2);
9014 kind = kind1 > kind2 ? kind1 : kind2;
9015 buf1 = PyUnicode_DATA(s1);
9016 buf2 = PyUnicode_DATA(s2);
9017 if (kind1 != kind)
9018 buf1 = _PyUnicode_AsKind(s1, kind);
9019 if (!buf1)
9020 return -2;
9021 if (kind2 != kind)
9022 buf2 = _PyUnicode_AsKind(s2, kind);
9023 if (!buf2) {
9024 if (kind1 != kind) PyMem_Free(buf1);
9025 return -2;
9026 }
9027 len1 = PyUnicode_GET_LENGTH(s1);
9028 len2 = PyUnicode_GET_LENGTH(s2);
9029
Victor Stinner794d5672011-10-10 03:21:36 +02009030 if (direction > 0) {
9031 switch(kind) {
9032 case PyUnicode_1BYTE_KIND:
9033 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9034 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9035 else
9036 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9037 break;
9038 case PyUnicode_2BYTE_KIND:
9039 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9040 break;
9041 case PyUnicode_4BYTE_KIND:
9042 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9043 break;
9044 default:
9045 assert(0); result = -2;
9046 }
9047 }
9048 else {
9049 switch(kind) {
9050 case PyUnicode_1BYTE_KIND:
9051 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9052 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9053 else
9054 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9055 break;
9056 case PyUnicode_2BYTE_KIND:
9057 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9058 break;
9059 case PyUnicode_4BYTE_KIND:
9060 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9061 break;
9062 default:
9063 assert(0); result = -2;
9064 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009065 }
9066
9067 if (kind1 != kind)
9068 PyMem_Free(buf1);
9069 if (kind2 != kind)
9070 PyMem_Free(buf2);
9071
9072 return result;
9073}
9074
9075Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009076_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077 Py_ssize_t n_buffer,
9078 void *digits, Py_ssize_t n_digits,
9079 Py_ssize_t min_width,
9080 const char *grouping,
9081 const char *thousands_sep)
9082{
9083 switch(kind) {
9084 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009085 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9086 return _PyUnicode_ascii_InsertThousandsGrouping(
9087 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9088 min_width, grouping, thousands_sep);
9089 else
9090 return _PyUnicode_ucs1_InsertThousandsGrouping(
9091 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9092 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093 case PyUnicode_2BYTE_KIND:
9094 return _PyUnicode_ucs2_InsertThousandsGrouping(
9095 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9096 min_width, grouping, thousands_sep);
9097 case PyUnicode_4BYTE_KIND:
9098 return _PyUnicode_ucs4_InsertThousandsGrouping(
9099 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9100 min_width, grouping, thousands_sep);
9101 }
9102 assert(0);
9103 return -1;
9104}
9105
9106
/* helper macro to fixup start/end slice values: clamp end to len, and add
   len to negative start/end values (clamping the result at 0).
   start and end must be modifiable lvalues.  Every argument is evaluated
   more than once, so none of them may have side effects.  The expansion
   is two consecutive if statements (not a single statement), so do not
   use the macro as the unbraced body of an if/else or a loop. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009121
Alexander Belopolsky40018472011-02-26 01:02:56 +00009122Py_ssize_t
9123PyUnicode_Count(PyObject *str,
9124 PyObject *substr,
9125 Py_ssize_t start,
9126 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009128 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009129 PyObject* str_obj;
9130 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009131 int kind1, kind2, kind;
9132 void *buf1 = NULL, *buf2 = NULL;
9133 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009134
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009135 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009137 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009138 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009139 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009140 Py_DECREF(str_obj);
9141 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009142 }
Tim Petersced69f82003-09-16 20:30:58 +00009143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009144 kind1 = PyUnicode_KIND(str_obj);
9145 kind2 = PyUnicode_KIND(sub_obj);
9146 kind = kind1 > kind2 ? kind1 : kind2;
9147 buf1 = PyUnicode_DATA(str_obj);
9148 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009149 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150 if (!buf1)
9151 goto onError;
9152 buf2 = PyUnicode_DATA(sub_obj);
9153 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009154 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155 if (!buf2)
9156 goto onError;
9157 len1 = PyUnicode_GET_LENGTH(str_obj);
9158 len2 = PyUnicode_GET_LENGTH(sub_obj);
9159
9160 ADJUST_INDICES(start, end, len1);
9161 switch(kind) {
9162 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009163 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9164 result = asciilib_count(
9165 ((Py_UCS1*)buf1) + start, end - start,
9166 buf2, len2, PY_SSIZE_T_MAX
9167 );
9168 else
9169 result = ucs1lib_count(
9170 ((Py_UCS1*)buf1) + start, end - start,
9171 buf2, len2, PY_SSIZE_T_MAX
9172 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009173 break;
9174 case PyUnicode_2BYTE_KIND:
9175 result = ucs2lib_count(
9176 ((Py_UCS2*)buf1) + start, end - start,
9177 buf2, len2, PY_SSIZE_T_MAX
9178 );
9179 break;
9180 case PyUnicode_4BYTE_KIND:
9181 result = ucs4lib_count(
9182 ((Py_UCS4*)buf1) + start, end - start,
9183 buf2, len2, PY_SSIZE_T_MAX
9184 );
9185 break;
9186 default:
9187 assert(0); result = 0;
9188 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009189
9190 Py_DECREF(sub_obj);
9191 Py_DECREF(str_obj);
9192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193 if (kind1 != kind)
9194 PyMem_Free(buf1);
9195 if (kind2 != kind)
9196 PyMem_Free(buf2);
9197
Guido van Rossumd57fd912000-03-10 22:53:23 +00009198 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009199 onError:
9200 Py_DECREF(sub_obj);
9201 Py_DECREF(str_obj);
9202 if (kind1 != kind && buf1)
9203 PyMem_Free(buf1);
9204 if (kind2 != kind && buf2)
9205 PyMem_Free(buf2);
9206 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009207}
9208
Alexander Belopolsky40018472011-02-26 01:02:56 +00009209Py_ssize_t
9210PyUnicode_Find(PyObject *str,
9211 PyObject *sub,
9212 Py_ssize_t start,
9213 Py_ssize_t end,
9214 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009216 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009217
Guido van Rossumd57fd912000-03-10 22:53:23 +00009218 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009219 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009220 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009221 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009222 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009223 Py_DECREF(str);
9224 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225 }
Tim Petersced69f82003-09-16 20:30:58 +00009226
Victor Stinner794d5672011-10-10 03:21:36 +02009227 result = any_find_slice(direction,
9228 str, sub, start, end
9229 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009230
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009232 Py_DECREF(sub);
9233
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234 return result;
9235}
9236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237Py_ssize_t
9238PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9239 Py_ssize_t start, Py_ssize_t end,
9240 int direction)
9241{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009243 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244 if (PyUnicode_READY(str) == -1)
9245 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009246 if (start < 0 || end < 0) {
9247 PyErr_SetString(PyExc_IndexError, "string index out of range");
9248 return -2;
9249 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009250 if (end > PyUnicode_GET_LENGTH(str))
9251 end = PyUnicode_GET_LENGTH(str);
9252 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009253 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9254 kind, end-start, ch, direction);
9255 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009257 else
9258 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009259}
9260
Alexander Belopolsky40018472011-02-26 01:02:56 +00009261static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009262tailmatch(PyObject *self,
9263 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009264 Py_ssize_t start,
9265 Py_ssize_t end,
9266 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009267{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009268 int kind_self;
9269 int kind_sub;
9270 void *data_self;
9271 void *data_sub;
9272 Py_ssize_t offset;
9273 Py_ssize_t i;
9274 Py_ssize_t end_sub;
9275
9276 if (PyUnicode_READY(self) == -1 ||
9277 PyUnicode_READY(substring) == -1)
9278 return 0;
9279
9280 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009281 return 1;
9282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9284 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009285 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009286 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288 kind_self = PyUnicode_KIND(self);
9289 data_self = PyUnicode_DATA(self);
9290 kind_sub = PyUnicode_KIND(substring);
9291 data_sub = PyUnicode_DATA(substring);
9292 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9293
9294 if (direction > 0)
9295 offset = end;
9296 else
9297 offset = start;
9298
9299 if (PyUnicode_READ(kind_self, data_self, offset) ==
9300 PyUnicode_READ(kind_sub, data_sub, 0) &&
9301 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9302 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9303 /* If both are of the same kind, memcmp is sufficient */
9304 if (kind_self == kind_sub) {
9305 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009306 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 data_sub,
9308 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009309 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310 }
9311 /* otherwise we have to compare each character by first accesing it */
9312 else {
9313 /* We do not need to compare 0 and len(substring)-1 because
9314 the if statement above ensured already that they are equal
9315 when we end up here. */
9316 // TODO: honor direction and do a forward or backwards search
9317 for (i = 1; i < end_sub; ++i) {
9318 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9319 PyUnicode_READ(kind_sub, data_sub, i))
9320 return 0;
9321 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009322 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009323 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009324 }
9325
9326 return 0;
9327}
9328
Alexander Belopolsky40018472011-02-26 01:02:56 +00009329Py_ssize_t
9330PyUnicode_Tailmatch(PyObject *str,
9331 PyObject *substr,
9332 Py_ssize_t start,
9333 Py_ssize_t end,
9334 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009335{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009336 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009337
Guido van Rossumd57fd912000-03-10 22:53:23 +00009338 str = PyUnicode_FromObject(str);
9339 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009340 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009341 substr = PyUnicode_FromObject(substr);
9342 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009343 Py_DECREF(str);
9344 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009345 }
Tim Petersced69f82003-09-16 20:30:58 +00009346
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009347 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009348 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349 Py_DECREF(str);
9350 Py_DECREF(substr);
9351 return result;
9352}
9353
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354/* Apply fixfct filter to the Unicode object self and return a
9355 reference to the modified object */
9356
Alexander Belopolsky40018472011-02-26 01:02:56 +00009357static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009358fixup(PyObject *self,
9359 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009360{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 PyObject *u;
9362 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363
Victor Stinner87af4f22011-11-21 23:03:47 +01009364 u = PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009366 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009367 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009369 /* fix functions return the new maximum character in a string,
9370 if the kind of the resulting unicode object does not change,
9371 everything is fine. Otherwise we need to change the string kind
9372 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009373 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 if (maxchar_new == 0)
9375 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9376 else if (maxchar_new <= 127)
9377 maxchar_new = 127;
9378 else if (maxchar_new <= 255)
9379 maxchar_new = 255;
9380 else if (maxchar_new <= 65535)
9381 maxchar_new = 65535;
9382 else
9383 maxchar_new = 1114111; /* 0x10ffff */
9384
9385 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009386 /* fixfct should return TRUE if it modified the buffer. If
9387 FALSE, return a reference to the original buffer instead
9388 (to save space, not time) */
9389 Py_INCREF(self);
9390 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009391 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009392 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009393 else if (maxchar_new == maxchar_old) {
9394 return u;
9395 }
9396 else {
9397 /* In case the maximum character changed, we need to
9398 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009399 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009400 if (v == NULL) {
9401 Py_DECREF(u);
9402 return NULL;
9403 }
9404 if (maxchar_new > maxchar_old) {
9405 /* If the maxchar increased so that the kind changed, not all
9406 characters are representable anymore and we need to fix the
9407 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009408 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009409 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9411 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009412 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009413 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009414 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415
9416 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009417 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418 return v;
9419 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009420}
9421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009422static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009423fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009424{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 /* No need to call PyUnicode_READY(self) because this function is only
9426 called as a callback from fixup() which does it already. */
9427 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9428 const int kind = PyUnicode_KIND(self);
9429 void *data = PyUnicode_DATA(self);
9430 int touched = 0;
9431 Py_UCS4 maxchar = 0;
9432 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434 for (i = 0; i < len; ++i) {
9435 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9436 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9437 if (up != ch) {
9438 if (up > maxchar)
9439 maxchar = up;
9440 PyUnicode_WRITE(kind, data, i, up);
9441 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009442 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 else if (ch > maxchar)
9444 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009445 }
9446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 if (touched)
9448 return maxchar;
9449 else
9450 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009451}
9452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009454fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9457 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9458 const int kind = PyUnicode_KIND(self);
9459 void *data = PyUnicode_DATA(self);
9460 int touched = 0;
9461 Py_UCS4 maxchar = 0;
9462 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 for(i = 0; i < len; ++i) {
9465 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9466 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9467 if (lo != ch) {
9468 if (lo > maxchar)
9469 maxchar = lo;
9470 PyUnicode_WRITE(kind, data, i, lo);
9471 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 else if (ch > maxchar)
9474 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475 }
9476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 if (touched)
9478 return maxchar;
9479 else
9480 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481}
9482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009484fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009485{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9487 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9488 const int kind = PyUnicode_KIND(self);
9489 void *data = PyUnicode_DATA(self);
9490 int touched = 0;
9491 Py_UCS4 maxchar = 0;
9492 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494 for(i = 0; i < len; ++i) {
9495 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9496 Py_UCS4 nu = 0;
9497
9498 if (Py_UNICODE_ISUPPER(ch))
9499 nu = Py_UNICODE_TOLOWER(ch);
9500 else if (Py_UNICODE_ISLOWER(ch))
9501 nu = Py_UNICODE_TOUPPER(ch);
9502
9503 if (nu != 0) {
9504 if (nu > maxchar)
9505 maxchar = nu;
9506 PyUnicode_WRITE(kind, data, i, nu);
9507 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 else if (ch > maxchar)
9510 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511 }
9512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 if (touched)
9514 return maxchar;
9515 else
9516 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517}
9518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009519static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009520fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009521{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009522 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9523 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9524 const int kind = PyUnicode_KIND(self);
9525 void *data = PyUnicode_DATA(self);
9526 int touched = 0;
9527 Py_UCS4 maxchar = 0;
9528 Py_ssize_t i = 0;
9529 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009530
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009531 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009532 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009533
9534 ch = PyUnicode_READ(kind, data, i);
9535 if (!Py_UNICODE_ISUPPER(ch)) {
9536 maxchar = Py_UNICODE_TOUPPER(ch);
9537 PyUnicode_WRITE(kind, data, i, maxchar);
9538 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 ++i;
9541 for(; i < len; ++i) {
9542 ch = PyUnicode_READ(kind, data, i);
9543 if (!Py_UNICODE_ISLOWER(ch)) {
9544 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9545 if (lo > maxchar)
9546 maxchar = lo;
9547 PyUnicode_WRITE(kind, data, i, lo);
9548 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550 else if (ch > maxchar)
9551 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009552 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009553
9554 if (touched)
9555 return maxchar;
9556 else
9557 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009558}
9559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009561fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009562{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9564 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9565 const int kind = PyUnicode_KIND(self);
9566 void *data = PyUnicode_DATA(self);
9567 Py_UCS4 maxchar = 0;
9568 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569 int previous_is_cased;
9570
9571 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572 if (len == 1) {
9573 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9574 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9575 if (ti != ch) {
9576 PyUnicode_WRITE(kind, data, i, ti);
9577 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009578 }
9579 else
9580 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009582 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009583 for(; i < len; ++i) {
9584 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9585 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009586
Benjamin Peterson29060642009-01-31 22:14:21 +00009587 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009589 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 nu = Py_UNICODE_TOTITLE(ch);
9591
9592 if (nu > maxchar)
9593 maxchar = nu;
9594 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009595
Benjamin Peterson29060642009-01-31 22:14:21 +00009596 if (Py_UNICODE_ISLOWER(ch) ||
9597 Py_UNICODE_ISUPPER(ch) ||
9598 Py_UNICODE_ISTITLE(ch))
9599 previous_is_cased = 1;
9600 else
9601 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009602 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009604}
9605
Tim Peters8ce9f162004-08-27 01:49:32 +00009606PyObject *
9607PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009609 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009610 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009612 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009613 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9614 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009615 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009616 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009617 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009619 int use_memcpy;
9620 unsigned char *res_data = NULL, *sep_data = NULL;
9621 PyObject *last_obj;
9622 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623
Tim Peters05eba1f2004-08-27 21:32:02 +00009624 fseq = PySequence_Fast(seq, "");
9625 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009626 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009627 }
9628
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009629 /* NOTE: the following code can't call back into Python code,
9630 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009631 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009632
Tim Peters05eba1f2004-08-27 21:32:02 +00009633 seqlen = PySequence_Fast_GET_SIZE(fseq);
9634 /* If empty sequence, return u"". */
9635 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009636 Py_DECREF(fseq);
9637 Py_INCREF(unicode_empty);
9638 res = unicode_empty;
9639 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009640 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009641
Tim Peters05eba1f2004-08-27 21:32:02 +00009642 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009643 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009644 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009645 if (seqlen == 1) {
9646 if (PyUnicode_CheckExact(items[0])) {
9647 res = items[0];
9648 Py_INCREF(res);
9649 Py_DECREF(fseq);
9650 return res;
9651 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009652 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009653 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009654 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009655 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009656 /* Set up sep and seplen */
9657 if (separator == NULL) {
9658 /* fall back to a blank space separator */
9659 sep = PyUnicode_FromOrdinal(' ');
9660 if (!sep)
9661 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009662 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009663 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009664 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009665 else {
9666 if (!PyUnicode_Check(separator)) {
9667 PyErr_Format(PyExc_TypeError,
9668 "separator: expected str instance,"
9669 " %.80s found",
9670 Py_TYPE(separator)->tp_name);
9671 goto onError;
9672 }
9673 if (PyUnicode_READY(separator))
9674 goto onError;
9675 sep = separator;
9676 seplen = PyUnicode_GET_LENGTH(separator);
9677 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9678 /* inc refcount to keep this code path symmetric with the
9679 above case of a blank separator */
9680 Py_INCREF(sep);
9681 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009682 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009683 }
9684
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009685 /* There are at least two things to join, or else we have a subclass
9686 * of str in the sequence.
9687 * Do a pre-pass to figure out the total amount of space we'll
9688 * need (sz), and see whether all argument are strings.
9689 */
9690 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009691#ifdef Py_DEBUG
9692 use_memcpy = 0;
9693#else
9694 use_memcpy = 1;
9695#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009696 for (i = 0; i < seqlen; i++) {
9697 const Py_ssize_t old_sz = sz;
9698 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009699 if (!PyUnicode_Check(item)) {
9700 PyErr_Format(PyExc_TypeError,
9701 "sequence item %zd: expected str instance,"
9702 " %.80s found",
9703 i, Py_TYPE(item)->tp_name);
9704 goto onError;
9705 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 if (PyUnicode_READY(item) == -1)
9707 goto onError;
9708 sz += PyUnicode_GET_LENGTH(item);
9709 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009710 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009711 if (i != 0)
9712 sz += seplen;
9713 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9714 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009715 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009716 goto onError;
9717 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009718 if (use_memcpy && last_obj != NULL) {
9719 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9720 use_memcpy = 0;
9721 }
9722 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009723 }
Tim Petersced69f82003-09-16 20:30:58 +00009724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009725 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009726 if (res == NULL)
9727 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009728
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009729 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009730#ifdef Py_DEBUG
9731 use_memcpy = 0;
9732#else
9733 if (use_memcpy) {
9734 res_data = PyUnicode_1BYTE_DATA(res);
9735 kind = PyUnicode_KIND(res);
9736 if (seplen != 0)
9737 sep_data = PyUnicode_1BYTE_DATA(sep);
9738 }
9739#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009741 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009742 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009743 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009744 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009745 if (use_memcpy) {
9746 Py_MEMCPY(res_data,
9747 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009748 kind * seplen);
9749 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009750 }
9751 else {
9752 copy_characters(res, res_offset, sep, 0, seplen);
9753 res_offset += seplen;
9754 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009755 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009756 itemlen = PyUnicode_GET_LENGTH(item);
9757 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009758 if (use_memcpy) {
9759 Py_MEMCPY(res_data,
9760 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009761 kind * itemlen);
9762 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009763 }
9764 else {
9765 copy_characters(res, res_offset, item, 0, itemlen);
9766 res_offset += itemlen;
9767 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009768 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009769 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009770 if (use_memcpy)
9771 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009772 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009773 else
9774 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009775
Tim Peters05eba1f2004-08-27 21:32:02 +00009776 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009778 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009780
Benjamin Peterson29060642009-01-31 22:14:21 +00009781 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009782 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009784 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009785 return NULL;
9786}
9787
/* Store `value` into `length` consecutive characters of the character
   buffer `data`, beginning at character index `start`.  `kind` selects the
   element width: PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND, or anything
   else for Py_UCS4 elements.  Every argument is parenthesized in the
   expansion so that arbitrary expressions can be passed; `value` and
   `length` may be evaluated more than once. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
9810
Victor Stinner9310abb2011-10-05 00:59:23 +02009811static PyObject *
9812pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009813 Py_ssize_t left,
9814 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009816{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 PyObject *u;
9818 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009819 int kind;
9820 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009821
9822 if (left < 0)
9823 left = 0;
9824 if (right < 0)
9825 right = 0;
9826
Tim Peters7a29bd52001-09-12 03:03:31 +00009827 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009828 Py_INCREF(self);
9829 return self;
9830 }
9831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9833 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009834 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9835 return NULL;
9836 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9838 if (fill > maxchar)
9839 maxchar = fill;
9840 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009841 if (!u)
9842 return NULL;
9843
9844 kind = PyUnicode_KIND(u);
9845 data = PyUnicode_DATA(u);
9846 if (left)
9847 FILL(kind, data, fill, 0, left);
9848 if (right)
9849 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009850 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009851 assert(_PyUnicode_CheckConsistency(u, 1));
9852 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009853}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009855
Alexander Belopolsky40018472011-02-26 01:02:56 +00009856PyObject *
9857PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009858{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860
9861 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009863 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 switch(PyUnicode_KIND(string)) {
9866 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009867 if (PyUnicode_IS_ASCII(string))
9868 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009869 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009870 PyUnicode_GET_LENGTH(string), keepends);
9871 else
9872 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009873 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009874 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 break;
9876 case PyUnicode_2BYTE_KIND:
9877 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009878 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009879 PyUnicode_GET_LENGTH(string), keepends);
9880 break;
9881 case PyUnicode_4BYTE_KIND:
9882 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009883 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009884 PyUnicode_GET_LENGTH(string), keepends);
9885 break;
9886 default:
9887 assert(0);
9888 list = 0;
9889 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009890 Py_DECREF(string);
9891 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009892}
9893
Alexander Belopolsky40018472011-02-26 01:02:56 +00009894static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009895split(PyObject *self,
9896 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009897 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009898{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009899 int kind1, kind2, kind;
9900 void *buf1, *buf2;
9901 Py_ssize_t len1, len2;
9902 PyObject* out;
9903
Guido van Rossumd57fd912000-03-10 22:53:23 +00009904 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009905 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 if (PyUnicode_READY(self) == -1)
9908 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 if (substring == NULL)
9911 switch(PyUnicode_KIND(self)) {
9912 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009913 if (PyUnicode_IS_ASCII(self))
9914 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009915 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009916 PyUnicode_GET_LENGTH(self), maxcount
9917 );
9918 else
9919 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009920 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009921 PyUnicode_GET_LENGTH(self), maxcount
9922 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 case PyUnicode_2BYTE_KIND:
9924 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009925 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926 PyUnicode_GET_LENGTH(self), maxcount
9927 );
9928 case PyUnicode_4BYTE_KIND:
9929 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009930 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931 PyUnicode_GET_LENGTH(self), maxcount
9932 );
9933 default:
9934 assert(0);
9935 return NULL;
9936 }
9937
9938 if (PyUnicode_READY(substring) == -1)
9939 return NULL;
9940
9941 kind1 = PyUnicode_KIND(self);
9942 kind2 = PyUnicode_KIND(substring);
9943 kind = kind1 > kind2 ? kind1 : kind2;
9944 buf1 = PyUnicode_DATA(self);
9945 buf2 = PyUnicode_DATA(substring);
9946 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009947 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009948 if (!buf1)
9949 return NULL;
9950 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009951 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 if (!buf2) {
9953 if (kind1 != kind) PyMem_Free(buf1);
9954 return NULL;
9955 }
9956 len1 = PyUnicode_GET_LENGTH(self);
9957 len2 = PyUnicode_GET_LENGTH(substring);
9958
9959 switch(kind) {
9960 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009961 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9962 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009963 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009964 else
9965 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009966 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009967 break;
9968 case PyUnicode_2BYTE_KIND:
9969 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009970 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009971 break;
9972 case PyUnicode_4BYTE_KIND:
9973 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009974 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 break;
9976 default:
9977 out = NULL;
9978 }
9979 if (kind1 != kind)
9980 PyMem_Free(buf1);
9981 if (kind2 != kind)
9982 PyMem_Free(buf2);
9983 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009984}
9985
Alexander Belopolsky40018472011-02-26 01:02:56 +00009986static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009987rsplit(PyObject *self,
9988 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009989 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 int kind1, kind2, kind;
9992 void *buf1, *buf2;
9993 Py_ssize_t len1, len2;
9994 PyObject* out;
9995
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009996 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009997 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 if (PyUnicode_READY(self) == -1)
10000 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010002 if (substring == NULL)
10003 switch(PyUnicode_KIND(self)) {
10004 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010005 if (PyUnicode_IS_ASCII(self))
10006 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010007 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010008 PyUnicode_GET_LENGTH(self), maxcount
10009 );
10010 else
10011 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010012 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010013 PyUnicode_GET_LENGTH(self), maxcount
10014 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 case PyUnicode_2BYTE_KIND:
10016 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010017 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 PyUnicode_GET_LENGTH(self), maxcount
10019 );
10020 case PyUnicode_4BYTE_KIND:
10021 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010022 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 PyUnicode_GET_LENGTH(self), maxcount
10024 );
10025 default:
10026 assert(0);
10027 return NULL;
10028 }
10029
10030 if (PyUnicode_READY(substring) == -1)
10031 return NULL;
10032
10033 kind1 = PyUnicode_KIND(self);
10034 kind2 = PyUnicode_KIND(substring);
10035 kind = kind1 > kind2 ? kind1 : kind2;
10036 buf1 = PyUnicode_DATA(self);
10037 buf2 = PyUnicode_DATA(substring);
10038 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010039 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 if (!buf1)
10041 return NULL;
10042 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010043 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044 if (!buf2) {
10045 if (kind1 != kind) PyMem_Free(buf1);
10046 return NULL;
10047 }
10048 len1 = PyUnicode_GET_LENGTH(self);
10049 len2 = PyUnicode_GET_LENGTH(substring);
10050
10051 switch(kind) {
10052 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010053 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10054 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010055 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010056 else
10057 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010058 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 break;
10060 case PyUnicode_2BYTE_KIND:
10061 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010062 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 break;
10064 case PyUnicode_4BYTE_KIND:
10065 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010066 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 break;
10068 default:
10069 out = NULL;
10070 }
10071 if (kind1 != kind)
10072 PyMem_Free(buf1);
10073 if (kind2 != kind)
10074 PyMem_Free(buf2);
10075 return out;
10076}
10077
10078static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010079anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10080 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081{
10082 switch(kind) {
10083 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010084 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10085 return asciilib_find(buf1, len1, buf2, len2, offset);
10086 else
10087 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 case PyUnicode_2BYTE_KIND:
10089 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10090 case PyUnicode_4BYTE_KIND:
10091 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10092 }
10093 assert(0);
10094 return -1;
10095}
10096
10097static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010098anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10099 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100{
10101 switch(kind) {
10102 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010103 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10104 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10105 else
10106 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 case PyUnicode_2BYTE_KIND:
10108 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10109 case PyUnicode_4BYTE_KIND:
10110 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10111 }
10112 assert(0);
10113 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010114}
10115
Alexander Belopolsky40018472011-02-26 01:02:56 +000010116static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117replace(PyObject *self, PyObject *str1,
10118 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010119{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 PyObject *u;
10121 char *sbuf = PyUnicode_DATA(self);
10122 char *buf1 = PyUnicode_DATA(str1);
10123 char *buf2 = PyUnicode_DATA(str2);
10124 int srelease = 0, release1 = 0, release2 = 0;
10125 int skind = PyUnicode_KIND(self);
10126 int kind1 = PyUnicode_KIND(str1);
10127 int kind2 = PyUnicode_KIND(str2);
10128 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10129 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10130 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010131 int mayshrink;
10132 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010133
10134 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010135 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010137 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138
Victor Stinner59de0ee2011-10-07 10:01:28 +020010139 if (str1 == str2)
10140 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 if (skind < kind1)
10142 /* substring too wide to be present */
10143 goto nothing;
10144
Victor Stinner49a0a212011-10-12 23:46:10 +020010145 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10146 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10147 /* Replacing str1 with str2 may cause a maxchar reduction in the
10148 result string. */
10149 mayshrink = (maxchar_str2 < maxchar);
10150 maxchar = Py_MAX(maxchar, maxchar_str2);
10151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010153 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010154 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010156 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010158 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010159 Py_UCS4 u1, u2;
10160 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010162 if (findchar(sbuf, PyUnicode_KIND(self),
10163 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010164 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010167 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010169 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 rkind = PyUnicode_KIND(u);
10171 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10172 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010173 if (--maxcount < 0)
10174 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010176 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010177 }
10178 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 int rkind = skind;
10180 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 if (kind1 < rkind) {
10183 /* widen substring */
10184 buf1 = _PyUnicode_AsKind(str1, rkind);
10185 if (!buf1) goto error;
10186 release1 = 1;
10187 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010188 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010189 if (i < 0)
10190 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 if (rkind > kind2) {
10192 /* widen replacement */
10193 buf2 = _PyUnicode_AsKind(str2, rkind);
10194 if (!buf2) goto error;
10195 release2 = 1;
10196 }
10197 else if (rkind < kind2) {
10198 /* widen self and buf1 */
10199 rkind = kind2;
10200 if (release1) PyMem_Free(buf1);
10201 sbuf = _PyUnicode_AsKind(self, rkind);
10202 if (!sbuf) goto error;
10203 srelease = 1;
10204 buf1 = _PyUnicode_AsKind(str1, rkind);
10205 if (!buf1) goto error;
10206 release1 = 1;
10207 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010208 u = PyUnicode_New(slen, maxchar);
10209 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010211 assert(PyUnicode_KIND(u) == rkind);
10212 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010213
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010214 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010215 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010216 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010218 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010220
10221 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010222 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010223 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010224 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010225 if (i == -1)
10226 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010227 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010229 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010231 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010233 }
10234 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 Py_ssize_t n, i, j, ires;
10236 Py_ssize_t product, new_size;
10237 int rkind = skind;
10238 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010241 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 buf1 = _PyUnicode_AsKind(str1, rkind);
10243 if (!buf1) goto error;
10244 release1 = 1;
10245 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010246 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010247 if (n == 0)
10248 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010250 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 buf2 = _PyUnicode_AsKind(str2, rkind);
10252 if (!buf2) goto error;
10253 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010254 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010256 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 rkind = kind2;
10258 sbuf = _PyUnicode_AsKind(self, rkind);
10259 if (!sbuf) goto error;
10260 srelease = 1;
10261 if (release1) PyMem_Free(buf1);
10262 buf1 = _PyUnicode_AsKind(str1, rkind);
10263 if (!buf1) goto error;
10264 release1 = 1;
10265 }
10266 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10267 PyUnicode_GET_LENGTH(str1))); */
10268 product = n * (len2-len1);
10269 if ((product / (len2-len1)) != n) {
10270 PyErr_SetString(PyExc_OverflowError,
10271 "replace string is too long");
10272 goto error;
10273 }
10274 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010275 if (new_size == 0) {
10276 Py_INCREF(unicode_empty);
10277 u = unicode_empty;
10278 goto done;
10279 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10281 PyErr_SetString(PyExc_OverflowError,
10282 "replace string is too long");
10283 goto error;
10284 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010285 u = PyUnicode_New(new_size, maxchar);
10286 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010288 assert(PyUnicode_KIND(u) == rkind);
10289 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 ires = i = 0;
10291 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010292 while (n-- > 0) {
10293 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010294 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010295 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010296 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010297 if (j == -1)
10298 break;
10299 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010300 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010301 memcpy(res + rkind * ires,
10302 sbuf + rkind * i,
10303 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010305 }
10306 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010307 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010308 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010310 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010313 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010316 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010317 memcpy(res + rkind * ires,
10318 sbuf + rkind * i,
10319 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010320 }
10321 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010322 /* interleave */
10323 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010324 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010326 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010328 if (--n <= 0)
10329 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010330 memcpy(res + rkind * ires,
10331 sbuf + rkind * i,
10332 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 ires++;
10334 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010335 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010336 memcpy(res + rkind * ires,
10337 sbuf + rkind * i,
10338 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010339 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010340 }
10341
10342 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010343 unicode_adjust_maxchar(&u);
10344 if (u == NULL)
10345 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010346 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010347
10348 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 if (srelease)
10350 PyMem_FREE(sbuf);
10351 if (release1)
10352 PyMem_FREE(buf1);
10353 if (release2)
10354 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010355 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010357
Benjamin Peterson29060642009-01-31 22:14:21 +000010358 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010359 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 if (srelease)
10361 PyMem_FREE(sbuf);
10362 if (release1)
10363 PyMem_FREE(buf1);
10364 if (release2)
10365 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010366 if (PyUnicode_CheckExact(self)) {
10367 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010368 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010369 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010370 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 error:
10372 if (srelease && sbuf)
10373 PyMem_FREE(sbuf);
10374 if (release1 && buf1)
10375 PyMem_FREE(buf1);
10376 if (release2 && buf2)
10377 PyMem_FREE(buf2);
10378 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379}
10380
10381/* --- Unicode Object Methods --------------------------------------------- */
10382
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010383PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010384 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385\n\
10386Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010387characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388
10389static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010390unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392 return fixup(self, fixtitle);
10393}
10394
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010395PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010396 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397\n\
10398Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010399have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010400
10401static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010402unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404 return fixup(self, fixcapitalize);
10405}
10406
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out.  Splits self with split(self, NULL, -1), runs every piece
   through fixup(..., fixcapitalize) and joins the pieces again with
   PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized = fixup(PyList_GET_ITEM(words, pos),
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10444
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010445/* Argument converter. Coerces to a single unicode character */
10446
10447static int
10448convert_uc(PyObject *obj, void *addr)
10449{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010451 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010452
Benjamin Peterson14339b62009-01-31 16:36:08 +000010453 uniobj = PyUnicode_FromObject(obj);
10454 if (uniobj == NULL) {
10455 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010456 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010457 return 0;
10458 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010460 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010461 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010462 Py_DECREF(uniobj);
10463 return 0;
10464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010466 Py_DECREF(uniobj);
10467 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010468}
10469
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010470PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010471 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010472\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010473Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010474done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010475
10476static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010477unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010478{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010479 Py_ssize_t marg, left;
10480 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 Py_UCS4 fillchar = ' ';
10482
Victor Stinnere9a29352011-10-01 02:14:59 +020010483 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485
Victor Stinnere9a29352011-10-01 02:14:59 +020010486 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487 return NULL;
10488
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010491 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492 }
10493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495 left = marg / 2 + (marg & width & 1);
10496
Victor Stinner9310abb2011-10-05 00:59:23 +020010497 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010498}
10499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500/* This function assumes that str1 and str2 are readied by the caller. */
10501
Marc-André Lemburge5034372000-08-08 08:04:29 +000010502static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010503unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010504{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 int kind1, kind2;
10506 void *data1, *data2;
10507 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 kind1 = PyUnicode_KIND(str1);
10510 kind2 = PyUnicode_KIND(str2);
10511 data1 = PyUnicode_DATA(str1);
10512 data2 = PyUnicode_DATA(str2);
10513 len1 = PyUnicode_GET_LENGTH(str1);
10514 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 for (i = 0; i < len1 && i < len2; ++i) {
10517 Py_UCS4 c1, c2;
10518 c1 = PyUnicode_READ(kind1, data1, i);
10519 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010520
10521 if (c1 != c2)
10522 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010523 }
10524
10525 return (len1 < len2) ? -1 : (len1 != len2);
10526}
10527
Alexander Belopolsky40018472011-02-26 01:02:56 +000010528int
10529PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10532 if (PyUnicode_READY(left) == -1 ||
10533 PyUnicode_READY(right) == -1)
10534 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010535 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010537 PyErr_Format(PyExc_TypeError,
10538 "Can't compare %.100s and %.100s",
10539 left->ob_type->tp_name,
10540 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541 return -1;
10542}
10543
Martin v. Löwis5b222132007-06-10 09:51:05 +000010544int
10545PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10546{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 Py_ssize_t i;
10548 int kind;
10549 void *data;
10550 Py_UCS4 chr;
10551
Victor Stinner910337b2011-10-03 03:20:16 +020010552 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 if (PyUnicode_READY(uni) == -1)
10554 return -1;
10555 kind = PyUnicode_KIND(uni);
10556 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010557 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10559 if (chr != str[i])
10560 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010561 /* This check keeps Python strings that end in '\0' from comparing equal
10562 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010564 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010565 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010566 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010567 return 0;
10568}
10569
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010570
Benjamin Peterson29060642009-01-31 22:14:21 +000010571#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010572 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010573
Alexander Belopolsky40018472011-02-26 01:02:56 +000010574PyObject *
10575PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010576{
10577 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010578
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010579 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10580 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 if (PyUnicode_READY(left) == -1 ||
10582 PyUnicode_READY(right) == -1)
10583 return NULL;
10584 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10585 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010586 if (op == Py_EQ) {
10587 Py_INCREF(Py_False);
10588 return Py_False;
10589 }
10590 if (op == Py_NE) {
10591 Py_INCREF(Py_True);
10592 return Py_True;
10593 }
10594 }
10595 if (left == right)
10596 result = 0;
10597 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010598 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010599
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010600 /* Convert the return value to a Boolean */
10601 switch (op) {
10602 case Py_EQ:
10603 v = TEST_COND(result == 0);
10604 break;
10605 case Py_NE:
10606 v = TEST_COND(result != 0);
10607 break;
10608 case Py_LE:
10609 v = TEST_COND(result <= 0);
10610 break;
10611 case Py_GE:
10612 v = TEST_COND(result >= 0);
10613 break;
10614 case Py_LT:
10615 v = TEST_COND(result == -1);
10616 break;
10617 case Py_GT:
10618 v = TEST_COND(result == 1);
10619 break;
10620 default:
10621 PyErr_BadArgument();
10622 return NULL;
10623 }
10624 Py_INCREF(v);
10625 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010626 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010627
Brian Curtindfc80e32011-08-10 20:28:54 -050010628 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010629}
10630
Alexander Belopolsky40018472011-02-26 01:02:56 +000010631int
10632PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010633{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010634 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 int kind1, kind2, kind;
10636 void *buf1, *buf2;
10637 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010638 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010639
10640 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010641 sub = PyUnicode_FromObject(element);
10642 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010643 PyErr_Format(PyExc_TypeError,
10644 "'in <string>' requires string as left operand, not %s",
10645 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010646 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010647 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 if (PyUnicode_READY(sub) == -1)
10649 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010650
Thomas Wouters477c8d52006-05-27 19:21:47 +000010651 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010652 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010653 Py_DECREF(sub);
10654 return -1;
10655 }
10656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 kind1 = PyUnicode_KIND(str);
10658 kind2 = PyUnicode_KIND(sub);
10659 kind = kind1 > kind2 ? kind1 : kind2;
10660 buf1 = PyUnicode_DATA(str);
10661 buf2 = PyUnicode_DATA(sub);
10662 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010663 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 if (!buf1) {
10665 Py_DECREF(sub);
10666 return -1;
10667 }
10668 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010669 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 if (!buf2) {
10671 Py_DECREF(sub);
10672 if (kind1 != kind) PyMem_Free(buf1);
10673 return -1;
10674 }
10675 len1 = PyUnicode_GET_LENGTH(str);
10676 len2 = PyUnicode_GET_LENGTH(sub);
10677
10678 switch(kind) {
10679 case PyUnicode_1BYTE_KIND:
10680 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10681 break;
10682 case PyUnicode_2BYTE_KIND:
10683 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10684 break;
10685 case PyUnicode_4BYTE_KIND:
10686 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10687 break;
10688 default:
10689 result = -1;
10690 assert(0);
10691 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010692
10693 Py_DECREF(str);
10694 Py_DECREF(sub);
10695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 if (kind1 != kind)
10697 PyMem_Free(buf1);
10698 if (kind2 != kind)
10699 PyMem_Free(buf2);
10700
Guido van Rossum403d68b2000-03-13 15:55:09 +000010701 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010702}
10703
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704/* Concat to string or Unicode object giving a new Unicode object. */
10705
Alexander Belopolsky40018472011-02-26 01:02:56 +000010706PyObject *
10707PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010710 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711
10712 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010715 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010718 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719
10720 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010721 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010722 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010725 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010726 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728 }
10729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010731 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10732 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 w = PyUnicode_New(
10736 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10737 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010739 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010740 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10741 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742 Py_DECREF(u);
10743 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010744 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010746
Benjamin Peterson29060642009-01-31 22:14:21 +000010747 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748 Py_XDECREF(u);
10749 Py_XDECREF(v);
10750 return NULL;
10751}
10752
Victor Stinnerb0923652011-10-04 01:17:31 +020010753static void
10754unicode_append_inplace(PyObject **p_left, PyObject *right)
10755{
10756 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010757
10758 assert(PyUnicode_IS_READY(*p_left));
10759 assert(PyUnicode_IS_READY(right));
10760
10761 left_len = PyUnicode_GET_LENGTH(*p_left);
10762 right_len = PyUnicode_GET_LENGTH(right);
10763 if (left_len > PY_SSIZE_T_MAX - right_len) {
10764 PyErr_SetString(PyExc_OverflowError,
10765 "strings are too large to concat");
10766 goto error;
10767 }
10768 new_len = left_len + right_len;
10769
10770 /* Now we own the last reference to 'left', so we can resize it
10771 * in-place.
10772 */
10773 if (unicode_resize(p_left, new_len) != 0) {
10774 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10775 * deallocated so it cannot be put back into
10776 * 'variable'. The MemoryError is raised when there
10777 * is no value in 'variable', which might (very
10778 * remotely) be a cause of incompatibilities.
10779 */
10780 goto error;
10781 }
10782 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010783 copy_characters(*p_left, left_len, right, 0, right_len);
10784 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010785 return;
10786
10787error:
10788 Py_DECREF(*p_left);
10789 *p_left = NULL;
10790}
10791
Walter Dörwald1ab83302007-05-18 17:15:44 +000010792void
Victor Stinner23e56682011-10-03 03:54:37 +020010793PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010794{
Victor Stinner23e56682011-10-03 03:54:37 +020010795 PyObject *left, *res;
10796
10797 if (p_left == NULL) {
10798 if (!PyErr_Occurred())
10799 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010800 return;
10801 }
Victor Stinner23e56682011-10-03 03:54:37 +020010802 left = *p_left;
10803 if (right == NULL || !PyUnicode_Check(left)) {
10804 if (!PyErr_Occurred())
10805 PyErr_BadInternalCall();
10806 goto error;
10807 }
10808
Victor Stinnere1335c72011-10-04 20:53:03 +020010809 if (PyUnicode_READY(left))
10810 goto error;
10811 if (PyUnicode_READY(right))
10812 goto error;
10813
Victor Stinner23e56682011-10-03 03:54:37 +020010814 if (PyUnicode_CheckExact(left) && left != unicode_empty
10815 && PyUnicode_CheckExact(right) && right != unicode_empty
10816 && unicode_resizable(left)
10817 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10818 || _PyUnicode_WSTR(left) != NULL))
10819 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010820 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10821 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010822 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010823 not so different than duplicating the string. */
10824 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010825 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010826 unicode_append_inplace(p_left, right);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010827 assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010828 return;
10829 }
10830 }
10831
10832 res = PyUnicode_Concat(left, right);
10833 if (res == NULL)
10834 goto error;
10835 Py_DECREF(left);
10836 *p_left = res;
10837 return;
10838
10839error:
10840 Py_DECREF(*p_left);
10841 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010842}
10843
10844void
10845PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10846{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010847 PyUnicode_Append(pleft, right);
10848 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010849}
10850
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010851PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010852 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010854Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010855string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010856interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857
10858static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010859unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010861 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010862 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010863 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865 int kind1, kind2, kind;
10866 void *buf1, *buf2;
10867 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868
Jesus Ceaac451502011-04-20 17:09:23 +020010869 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10870 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010871 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 kind1 = PyUnicode_KIND(self);
10874 kind2 = PyUnicode_KIND(substring);
10875 kind = kind1 > kind2 ? kind1 : kind2;
10876 buf1 = PyUnicode_DATA(self);
10877 buf2 = PyUnicode_DATA(substring);
10878 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010879 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 if (!buf1) {
10881 Py_DECREF(substring);
10882 return NULL;
10883 }
10884 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010885 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010886 if (!buf2) {
10887 Py_DECREF(substring);
10888 if (kind1 != kind) PyMem_Free(buf1);
10889 return NULL;
10890 }
10891 len1 = PyUnicode_GET_LENGTH(self);
10892 len2 = PyUnicode_GET_LENGTH(substring);
10893
10894 ADJUST_INDICES(start, end, len1);
10895 switch(kind) {
10896 case PyUnicode_1BYTE_KIND:
10897 iresult = ucs1lib_count(
10898 ((Py_UCS1*)buf1) + start, end - start,
10899 buf2, len2, PY_SSIZE_T_MAX
10900 );
10901 break;
10902 case PyUnicode_2BYTE_KIND:
10903 iresult = ucs2lib_count(
10904 ((Py_UCS2*)buf1) + start, end - start,
10905 buf2, len2, PY_SSIZE_T_MAX
10906 );
10907 break;
10908 case PyUnicode_4BYTE_KIND:
10909 iresult = ucs4lib_count(
10910 ((Py_UCS4*)buf1) + start, end - start,
10911 buf2, len2, PY_SSIZE_T_MAX
10912 );
10913 break;
10914 default:
10915 assert(0); iresult = 0;
10916 }
10917
10918 result = PyLong_FromSsize_t(iresult);
10919
10920 if (kind1 != kind)
10921 PyMem_Free(buf1);
10922 if (kind2 != kind)
10923 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924
10925 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010926
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927 return result;
10928}
10929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010930PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010931 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010933Encode S using the codec registered for encoding. Default encoding\n\
10934is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010935handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010936a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10937'xmlcharrefreplace' as well as any other name registered with\n\
10938codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939
10940static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010941unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010943 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944 char *encoding = NULL;
10945 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010946
Benjamin Peterson308d6372009-09-18 21:42:35 +000010947 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10948 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010950 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010951}
10952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010953PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010954 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955\n\
10956Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010957If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958
10959static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010960unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010962 Py_ssize_t i, j, line_pos, src_len, incr;
10963 Py_UCS4 ch;
10964 PyObject *u;
10965 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010967 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010968 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969
10970 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010971 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972
Antoine Pitrou22425222011-10-04 19:10:51 +020010973 if (PyUnicode_READY(self) == -1)
10974 return NULL;
10975
Thomas Wouters7e474022000-07-16 12:04:32 +000010976 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010977 src_len = PyUnicode_GET_LENGTH(self);
10978 i = j = line_pos = 0;
10979 kind = PyUnicode_KIND(self);
10980 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010981 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010982 for (; i < src_len; i++) {
10983 ch = PyUnicode_READ(kind, src_data, i);
10984 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010985 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010986 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010987 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010988 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010989 goto overflow;
10990 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010991 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010992 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010995 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010996 goto overflow;
10997 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010999 if (ch == '\n' || ch == '\r')
11000 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011002 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020011003 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010011004 Py_INCREF(self);
11005 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011006 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011007
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011009 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010 if (!u)
11011 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011012 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013
Antoine Pitroue71d5742011-10-04 15:55:09 +020011014 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015
Antoine Pitroue71d5742011-10-04 15:55:09 +020011016 for (; i < src_len; i++) {
11017 ch = PyUnicode_READ(kind, src_data, i);
11018 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011019 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011020 incr = tabsize - (line_pos % tabsize);
11021 line_pos += incr;
11022 while (incr--) {
11023 PyUnicode_WRITE(kind, dest_data, j, ' ');
11024 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011025 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011026 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011027 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011028 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011029 line_pos++;
11030 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011031 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011032 if (ch == '\n' || ch == '\r')
11033 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011035 }
11036 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011037 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011038
Antoine Pitroue71d5742011-10-04 15:55:09 +020011039 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011040 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11041 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042}
11043
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011044PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011045 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046\n\
11047Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011048such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049arguments start and end are interpreted as in slice notation.\n\
11050\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011051Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011052
11053static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011054unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011056 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011057 Py_ssize_t start;
11058 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011059 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060
Jesus Ceaac451502011-04-20 17:09:23 +020011061 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11062 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065 if (PyUnicode_READY(self) == -1)
11066 return NULL;
11067 if (PyUnicode_READY(substring) == -1)
11068 return NULL;
11069
Victor Stinner7931d9a2011-11-04 00:22:48 +010011070 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071
11072 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011074 if (result == -2)
11075 return NULL;
11076
Christian Heimes217cfd12007-12-02 14:31:20 +000011077 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078}
11079
11080static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011081unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011083 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11084 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011085 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087}
11088
Guido van Rossumc2504932007-09-18 19:42:40 +000011089/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011090 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011091static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011092unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011093{
Guido van Rossumc2504932007-09-18 19:42:40 +000011094 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011095 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011097 if (_PyUnicode_HASH(self) != -1)
11098 return _PyUnicode_HASH(self);
11099 if (PyUnicode_READY(self) == -1)
11100 return -1;
11101 len = PyUnicode_GET_LENGTH(self);
11102
11103 /* The hash function as a macro, gets expanded three times below. */
11104#define HASH(P) \
11105 x = (Py_uhash_t)*P << 7; \
11106 while (--len >= 0) \
11107 x = (1000003*x) ^ (Py_uhash_t)*P++;
11108
11109 switch (PyUnicode_KIND(self)) {
11110 case PyUnicode_1BYTE_KIND: {
11111 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11112 HASH(c);
11113 break;
11114 }
11115 case PyUnicode_2BYTE_KIND: {
11116 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11117 HASH(s);
11118 break;
11119 }
11120 default: {
11121 Py_UCS4 *l;
11122 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11123 "Impossible switch case in unicode_hash");
11124 l = PyUnicode_4BYTE_DATA(self);
11125 HASH(l);
11126 break;
11127 }
11128 }
11129 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11130
Guido van Rossumc2504932007-09-18 19:42:40 +000011131 if (x == -1)
11132 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011133 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011134 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011136#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011138PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011139 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011141Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142
11143static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011146 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011147 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011148 Py_ssize_t start;
11149 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150
Jesus Ceaac451502011-04-20 17:09:23 +020011151 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11152 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155 if (PyUnicode_READY(self) == -1)
11156 return NULL;
11157 if (PyUnicode_READY(substring) == -1)
11158 return NULL;
11159
Victor Stinner7931d9a2011-11-04 00:22:48 +010011160 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161
11162 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164 if (result == -2)
11165 return NULL;
11166
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167 if (result < 0) {
11168 PyErr_SetString(PyExc_ValueError, "substring not found");
11169 return NULL;
11170 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011171
Christian Heimes217cfd12007-12-02 14:31:20 +000011172 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173}
11174
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011175PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011176 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011178Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011179at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180
11181static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011182unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 Py_ssize_t i, length;
11185 int kind;
11186 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187 int cased;
11188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011189 if (PyUnicode_READY(self) == -1)
11190 return NULL;
11191 length = PyUnicode_GET_LENGTH(self);
11192 kind = PyUnicode_KIND(self);
11193 data = PyUnicode_DATA(self);
11194
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 if (length == 1)
11197 return PyBool_FromLong(
11198 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011200 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011202 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011203
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 for (i = 0; i < length; i++) {
11206 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011207
Benjamin Peterson29060642009-01-31 22:14:21 +000011208 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11209 return PyBool_FromLong(0);
11210 else if (!cased && Py_UNICODE_ISLOWER(ch))
11211 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011213 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214}
11215
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011216PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011217 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011219Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011220at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221
11222static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011223unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225 Py_ssize_t i, length;
11226 int kind;
11227 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228 int cased;
11229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 if (PyUnicode_READY(self) == -1)
11231 return NULL;
11232 length = PyUnicode_GET_LENGTH(self);
11233 kind = PyUnicode_KIND(self);
11234 data = PyUnicode_DATA(self);
11235
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011237 if (length == 1)
11238 return PyBool_FromLong(
11239 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011241 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011242 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011243 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011244
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011246 for (i = 0; i < length; i++) {
11247 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011248
Benjamin Peterson29060642009-01-31 22:14:21 +000011249 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11250 return PyBool_FromLong(0);
11251 else if (!cased && Py_UNICODE_ISUPPER(ch))
11252 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011254 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255}
11256
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011257PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011258 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011260Return True if S is a titlecased string and there is at least one\n\
11261character in S, i.e. upper- and titlecase characters may only\n\
11262follow uncased characters and lowercase characters only cased ones.\n\
11263Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264
11265static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011266unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011268 Py_ssize_t i, length;
11269 int kind;
11270 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271 int cased, previous_is_cased;
11272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011273 if (PyUnicode_READY(self) == -1)
11274 return NULL;
11275 length = PyUnicode_GET_LENGTH(self);
11276 kind = PyUnicode_KIND(self);
11277 data = PyUnicode_DATA(self);
11278
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011280 if (length == 1) {
11281 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11282 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11283 (Py_UNICODE_ISUPPER(ch) != 0));
11284 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011286 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011288 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011289
Guido van Rossumd57fd912000-03-10 22:53:23 +000011290 cased = 0;
11291 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011292 for (i = 0; i < length; i++) {
11293 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011294
Benjamin Peterson29060642009-01-31 22:14:21 +000011295 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11296 if (previous_is_cased)
11297 return PyBool_FromLong(0);
11298 previous_is_cased = 1;
11299 cased = 1;
11300 }
11301 else if (Py_UNICODE_ISLOWER(ch)) {
11302 if (!previous_is_cased)
11303 return PyBool_FromLong(0);
11304 previous_is_cased = 1;
11305 cased = 1;
11306 }
11307 else
11308 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011310 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311}
11312
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011313PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011314 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011316Return True if all characters in S are whitespace\n\
11317and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318
11319static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011320unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322 Py_ssize_t i, length;
11323 int kind;
11324 void *data;
11325
11326 if (PyUnicode_READY(self) == -1)
11327 return NULL;
11328 length = PyUnicode_GET_LENGTH(self);
11329 kind = PyUnicode_KIND(self);
11330 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 if (length == 1)
11334 return PyBool_FromLong(
11335 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011337 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011339 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 for (i = 0; i < length; i++) {
11342 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011343 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011344 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011346 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347}
11348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011349PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011350 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011351\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011352Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011353and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011354
11355static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011356unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011357{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 Py_ssize_t i, length;
11359 int kind;
11360 void *data;
11361
11362 if (PyUnicode_READY(self) == -1)
11363 return NULL;
11364 length = PyUnicode_GET_LENGTH(self);
11365 kind = PyUnicode_KIND(self);
11366 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011367
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011368 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 if (length == 1)
11370 return PyBool_FromLong(
11371 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011372
11373 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011375 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 for (i = 0; i < length; i++) {
11378 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011379 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011380 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011381 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011382}
11383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011384PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011385 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011386\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011387Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011388and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011389
11390static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011391unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011392{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393 int kind;
11394 void *data;
11395 Py_ssize_t len, i;
11396
11397 if (PyUnicode_READY(self) == -1)
11398 return NULL;
11399
11400 kind = PyUnicode_KIND(self);
11401 data = PyUnicode_DATA(self);
11402 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011403
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011404 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011405 if (len == 1) {
11406 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11407 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11408 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011409
11410 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011412 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011414 for (i = 0; i < len; i++) {
11415 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011416 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011417 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011418 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011419 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011420}
11421
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011422PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011423 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011425Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011426False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427
11428static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011429unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431 Py_ssize_t i, length;
11432 int kind;
11433 void *data;
11434
11435 if (PyUnicode_READY(self) == -1)
11436 return NULL;
11437 length = PyUnicode_GET_LENGTH(self);
11438 kind = PyUnicode_KIND(self);
11439 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 if (length == 1)
11443 return PyBool_FromLong(
11444 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011446 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011448 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 for (i = 0; i < length; i++) {
11451 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011452 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011454 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455}
11456
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011457PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011458 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011460Return True if all characters in S are digits\n\
11461and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462
11463static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011464unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011465{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 Py_ssize_t i, length;
11467 int kind;
11468 void *data;
11469
11470 if (PyUnicode_READY(self) == -1)
11471 return NULL;
11472 length = PyUnicode_GET_LENGTH(self);
11473 kind = PyUnicode_KIND(self);
11474 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477 if (length == 1) {
11478 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11479 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11480 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011482 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011484 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 for (i = 0; i < length; i++) {
11487 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011488 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011490 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491}
11492
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011493PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011494 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011496Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011497False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498
11499static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011500unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 Py_ssize_t i, length;
11503 int kind;
11504 void *data;
11505
11506 if (PyUnicode_READY(self) == -1)
11507 return NULL;
11508 length = PyUnicode_GET_LENGTH(self);
11509 kind = PyUnicode_KIND(self);
11510 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 if (length == 1)
11514 return PyBool_FromLong(
11515 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011517 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011519 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 for (i = 0; i < length; i++) {
11522 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011523 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011525 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526}
11527
Martin v. Löwis47383402007-08-15 07:32:56 +000011528int
11529PyUnicode_IsIdentifier(PyObject *self)
11530{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531 int kind;
11532 void *data;
11533 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011534 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 if (PyUnicode_READY(self) == -1) {
11537 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011538 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539 }
11540
11541 /* Special case for empty strings */
11542 if (PyUnicode_GET_LENGTH(self) == 0)
11543 return 0;
11544 kind = PyUnicode_KIND(self);
11545 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011546
11547 /* PEP 3131 says that the first character must be in
11548 XID_Start and subsequent characters in XID_Continue,
11549 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011550 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011551 letters, digits, underscore). However, given the current
11552 definition of XID_Start and XID_Continue, it is sufficient
11553 to check just for these, except that _ must be allowed
11554 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011556 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011557 return 0;
11558
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011559 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011561 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011562 return 1;
11563}
11564
11565PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011566 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011567\n\
11568Return True if S is a valid identifier according\n\
11569to the language definition.");
11570
11571static PyObject*
11572unicode_isidentifier(PyObject *self)
11573{
11574 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11575}
11576
Georg Brandl559e5d72008-06-11 18:37:52 +000011577PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011578 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011579\n\
11580Return True if all characters in S are considered\n\
11581printable in repr() or S is empty, False otherwise.");
11582
11583static PyObject*
11584unicode_isprintable(PyObject *self)
11585{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586 Py_ssize_t i, length;
11587 int kind;
11588 void *data;
11589
11590 if (PyUnicode_READY(self) == -1)
11591 return NULL;
11592 length = PyUnicode_GET_LENGTH(self);
11593 kind = PyUnicode_KIND(self);
11594 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011595
11596 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011597 if (length == 1)
11598 return PyBool_FromLong(
11599 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 for (i = 0; i < length; i++) {
11602 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011603 Py_RETURN_FALSE;
11604 }
11605 }
11606 Py_RETURN_TRUE;
11607}
11608
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011609PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011610 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611\n\
11612Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011613iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614
11615static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011616unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011618 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619}
11620
Martin v. Löwis18e16552006-02-15 17:27:45 +000011621static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011622unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624 if (PyUnicode_READY(self) == -1)
11625 return -1;
11626 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627}
11628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011629PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011630 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011632Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011633done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634
11635static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011636unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011638 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 Py_UCS4 fillchar = ' ';
11640
11641 if (PyUnicode_READY(self) == -1)
11642 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011643
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011644 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645 return NULL;
11646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011647 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011649 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011650 }
11651
Victor Stinner7931d9a2011-11-04 00:22:48 +010011652 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653}
11654
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011655PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011656 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011658Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659
11660static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011661unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663 return fixup(self, fixlower);
11664}
11665
/* Selectors for the strip helpers: which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple format strings, one per strip selector. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: skips the 3-character "|O:" prefix of the
   corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
11674
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011675/* externally visible for str.strip(unicode) */
11676PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011677_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011678{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 void *data;
11680 int kind;
11681 Py_ssize_t i, j, len;
11682 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11685 return NULL;
11686
11687 kind = PyUnicode_KIND(self);
11688 data = PyUnicode_DATA(self);
11689 len = PyUnicode_GET_LENGTH(self);
11690 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11691 PyUnicode_DATA(sepobj),
11692 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011693
Benjamin Peterson14339b62009-01-31 16:36:08 +000011694 i = 0;
11695 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 while (i < len &&
11697 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011698 i++;
11699 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011700 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011701
Benjamin Peterson14339b62009-01-31 16:36:08 +000011702 j = len;
11703 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011704 do {
11705 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706 } while (j >= i &&
11707 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011708 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011709 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011710
Victor Stinner7931d9a2011-11-04 00:22:48 +010011711 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712}
11713
11714PyObject*
11715PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11716{
11717 unsigned char *data;
11718 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011719 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720
Victor Stinnerde636f32011-10-01 03:55:54 +020011721 if (PyUnicode_READY(self) == -1)
11722 return NULL;
11723
11724 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11725
Victor Stinner12bab6d2011-10-01 01:53:49 +020011726 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011727 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011728 if (PyUnicode_CheckExact(self)) {
11729 Py_INCREF(self);
11730 return self;
11731 }
11732 else
11733 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 }
11735
Victor Stinner12bab6d2011-10-01 01:53:49 +020011736 length = end - start;
11737 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011738 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739
Victor Stinnerde636f32011-10-01 03:55:54 +020011740 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011741 PyErr_SetString(PyExc_IndexError, "string index out of range");
11742 return NULL;
11743 }
11744
Victor Stinnerb9275c12011-10-05 14:01:42 +020011745 if (PyUnicode_IS_ASCII(self)) {
11746 kind = PyUnicode_KIND(self);
11747 data = PyUnicode_1BYTE_DATA(self);
11748 return unicode_fromascii(data + start, length);
11749 }
11750 else {
11751 kind = PyUnicode_KIND(self);
11752 data = PyUnicode_1BYTE_DATA(self);
11753 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011754 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011755 length);
11756 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758
11759static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011760do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011762 int kind;
11763 void *data;
11764 Py_ssize_t len, i, j;
11765
11766 if (PyUnicode_READY(self) == -1)
11767 return NULL;
11768
11769 kind = PyUnicode_KIND(self);
11770 data = PyUnicode_DATA(self);
11771 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011772
Benjamin Peterson14339b62009-01-31 16:36:08 +000011773 i = 0;
11774 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011776 i++;
11777 }
11778 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011779
Benjamin Peterson14339b62009-01-31 16:36:08 +000011780 j = len;
11781 if (striptype != LEFTSTRIP) {
11782 do {
11783 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011784 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011785 j++;
11786 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011787
Victor Stinner7931d9a2011-11-04 00:22:48 +010011788 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789}
11790
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011791
11792static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011793do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011794{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011795 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011796
Benjamin Peterson14339b62009-01-31 16:36:08 +000011797 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11798 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011799
Benjamin Peterson14339b62009-01-31 16:36:08 +000011800 if (sep != NULL && sep != Py_None) {
11801 if (PyUnicode_Check(sep))
11802 return _PyUnicode_XStrip(self, striptype, sep);
11803 else {
11804 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011805 "%s arg must be None or str",
11806 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011807 return NULL;
11808 }
11809 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011810
Benjamin Peterson14339b62009-01-31 16:36:08 +000011811 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011812}
11813
11814
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011815PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011816 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011817\n\
11818Return a copy of the string S with leading and trailing\n\
11819whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011820If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011821
11822static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011823unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011824{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011825 if (PyTuple_GET_SIZE(args) == 0)
11826 return do_strip(self, BOTHSTRIP); /* Common case */
11827 else
11828 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011829}
11830
11831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011832PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011833 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011834\n\
11835Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011836If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011837
11838static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011839unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011840{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011841 if (PyTuple_GET_SIZE(args) == 0)
11842 return do_strip(self, LEFTSTRIP); /* Common case */
11843 else
11844 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011845}
11846
11847
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011848PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011849 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011850\n\
11851Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011852If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011853
11854static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011855unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011856{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011857 if (PyTuple_GET_SIZE(args) == 0)
11858 return do_strip(self, RIGHTSTRIP); /* Common case */
11859 else
11860 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011861}
11862
11863
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011865unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011867 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869
Georg Brandl222de0f2009-04-12 12:01:50 +000011870 if (len < 1) {
11871 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011872 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011873 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874
Tim Peters7a29bd52001-09-12 03:03:31 +000011875 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876 /* no repeat, return original string */
11877 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011878 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879 }
Tim Peters8f422462000-09-09 06:13:41 +000011880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 if (PyUnicode_READY(str) == -1)
11882 return NULL;
11883
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011884 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011885 PyErr_SetString(PyExc_OverflowError,
11886 "repeated string is too long");
11887 return NULL;
11888 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011889 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011890
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011891 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892 if (!u)
11893 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011894 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 if (PyUnicode_GET_LENGTH(str) == 1) {
11897 const int kind = PyUnicode_KIND(str);
11898 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11899 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011900 if (kind == PyUnicode_1BYTE_KIND)
11901 memset(to, (unsigned char)fill_char, len);
11902 else {
11903 for (n = 0; n < len; ++n)
11904 PyUnicode_WRITE(kind, to, n, fill_char);
11905 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 }
11907 else {
11908 /* number of characters copied this far */
11909 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011910 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 char *to = (char *) PyUnicode_DATA(u);
11912 Py_MEMCPY(to, PyUnicode_DATA(str),
11913 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011914 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011915 n = (done <= nchars-done) ? done : nchars-done;
11916 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011917 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011918 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919 }
11920
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011921 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011922 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923}
11924
Alexander Belopolsky40018472011-02-26 01:02:56 +000011925PyObject *
11926PyUnicode_Replace(PyObject *obj,
11927 PyObject *subobj,
11928 PyObject *replobj,
11929 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930{
11931 PyObject *self;
11932 PyObject *str1;
11933 PyObject *str2;
11934 PyObject *result;
11935
11936 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011937 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011938 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011939 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011940 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011941 Py_DECREF(self);
11942 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943 }
11944 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011945 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011946 Py_DECREF(self);
11947 Py_DECREF(str1);
11948 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951 Py_DECREF(self);
11952 Py_DECREF(str1);
11953 Py_DECREF(str2);
11954 return result;
11955}
11956
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011957PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011958 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959\n\
11960Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011961old replaced by new. If the optional argument count is\n\
11962given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963
11964static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 PyObject *str1;
11968 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011969 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970 PyObject *result;
11971
Martin v. Löwis18e16552006-02-15 17:27:45 +000011972 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011975 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 str1 = PyUnicode_FromObject(str1);
11977 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11978 return NULL;
11979 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011980 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011981 Py_DECREF(str1);
11982 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984
11985 result = replace(self, str1, str2, maxcount);
11986
11987 Py_DECREF(str1);
11988 Py_DECREF(str2);
11989 return result;
11990}
11991
Alexander Belopolsky40018472011-02-26 01:02:56 +000011992static PyObject *
11993unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011995 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 Py_ssize_t isize;
11997 Py_ssize_t osize, squote, dquote, i, o;
11998 Py_UCS4 max, quote;
11999 int ikind, okind;
12000 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012003 return NULL;
12004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 isize = PyUnicode_GET_LENGTH(unicode);
12006 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 /* Compute length of output, quote characters, and
12009 maximum character */
12010 osize = 2; /* quotes */
12011 max = 127;
12012 squote = dquote = 0;
12013 ikind = PyUnicode_KIND(unicode);
12014 for (i = 0; i < isize; i++) {
12015 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12016 switch (ch) {
12017 case '\'': squote++; osize++; break;
12018 case '"': dquote++; osize++; break;
12019 case '\\': case '\t': case '\r': case '\n':
12020 osize += 2; break;
12021 default:
12022 /* Fast-path ASCII */
12023 if (ch < ' ' || ch == 0x7f)
12024 osize += 4; /* \xHH */
12025 else if (ch < 0x7f)
12026 osize++;
12027 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12028 osize++;
12029 max = ch > max ? ch : max;
12030 }
12031 else if (ch < 0x100)
12032 osize += 4; /* \xHH */
12033 else if (ch < 0x10000)
12034 osize += 6; /* \uHHHH */
12035 else
12036 osize += 10; /* \uHHHHHHHH */
12037 }
12038 }
12039
12040 quote = '\'';
12041 if (squote) {
12042 if (dquote)
12043 /* Both squote and dquote present. Use squote,
12044 and escape them */
12045 osize += squote;
12046 else
12047 quote = '"';
12048 }
12049
12050 repr = PyUnicode_New(osize, max);
12051 if (repr == NULL)
12052 return NULL;
12053 okind = PyUnicode_KIND(repr);
12054 odata = PyUnicode_DATA(repr);
12055
12056 PyUnicode_WRITE(okind, odata, 0, quote);
12057 PyUnicode_WRITE(okind, odata, osize-1, quote);
12058
12059 for (i = 0, o = 1; i < isize; i++) {
12060 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012061
12062 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 if ((ch == quote) || (ch == '\\')) {
12064 PyUnicode_WRITE(okind, odata, o++, '\\');
12065 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012066 continue;
12067 }
12068
Benjamin Peterson29060642009-01-31 22:14:21 +000012069 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012070 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 PyUnicode_WRITE(okind, odata, o++, '\\');
12072 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012073 }
12074 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012075 PyUnicode_WRITE(okind, odata, o++, '\\');
12076 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012077 }
12078 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 PyUnicode_WRITE(okind, odata, o++, '\\');
12080 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012081 }
12082
12083 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012084 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085 PyUnicode_WRITE(okind, odata, o++, '\\');
12086 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012087 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12088 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012089 }
12090
Georg Brandl559e5d72008-06-11 18:37:52 +000012091 /* Copy ASCII characters as-is */
12092 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012094 }
12095
Benjamin Peterson29060642009-01-31 22:14:21 +000012096 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012097 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012098 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012099 (categories Z* and C* except ASCII space)
12100 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012101 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012102 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 if (ch <= 0xff) {
12104 PyUnicode_WRITE(okind, odata, o++, '\\');
12105 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012106 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12107 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012108 }
12109 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110 else if (ch >= 0x10000) {
12111 PyUnicode_WRITE(okind, odata, o++, '\\');
12112 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012113 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12114 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12115 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12116 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12117 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12118 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12119 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12120 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012121 }
12122 /* Map 16-bit characters to '\uxxxx' */
12123 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 PyUnicode_WRITE(okind, odata, o++, '\\');
12125 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012126 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12127 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12128 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12129 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012130 }
12131 }
12132 /* Copy characters as-is */
12133 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012135 }
12136 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012137 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012139 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012140 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141}
12142
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012143PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012144 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012145\n\
12146Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012147such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148arguments start and end are interpreted as in slice notation.\n\
12149\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012150Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151
12152static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012155 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012156 Py_ssize_t start;
12157 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012158 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159
Jesus Ceaac451502011-04-20 17:09:23 +020012160 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12161 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012162 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 if (PyUnicode_READY(self) == -1)
12165 return NULL;
12166 if (PyUnicode_READY(substring) == -1)
12167 return NULL;
12168
Victor Stinner7931d9a2011-11-04 00:22:48 +010012169 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170
12171 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012172
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 if (result == -2)
12174 return NULL;
12175
Christian Heimes217cfd12007-12-02 14:31:20 +000012176 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177}
12178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012179PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012180 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012182Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183
12184static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012187 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012188 Py_ssize_t start;
12189 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012190 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191
Jesus Ceaac451502011-04-20 17:09:23 +020012192 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12193 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012194 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196 if (PyUnicode_READY(self) == -1)
12197 return NULL;
12198 if (PyUnicode_READY(substring) == -1)
12199 return NULL;
12200
Victor Stinner7931d9a2011-11-04 00:22:48 +010012201 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012202
12203 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 if (result == -2)
12206 return NULL;
12207
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208 if (result < 0) {
12209 PyErr_SetString(PyExc_ValueError, "substring not found");
12210 return NULL;
12211 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012212
Christian Heimes217cfd12007-12-02 14:31:20 +000012213 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214}
12215
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012216PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012217 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012219Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012220done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221
12222static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012223unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012225 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226 Py_UCS4 fillchar = ' ';
12227
Victor Stinnere9a29352011-10-01 02:14:59 +020012228 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012230
Victor Stinnere9a29352011-10-01 02:14:59 +020012231 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232 return NULL;
12233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012236 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237 }
12238
Victor Stinner7931d9a2011-11-04 00:22:48 +010012239 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240}
12241
Alexander Belopolsky40018472011-02-26 01:02:56 +000012242PyObject *
12243PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244{
12245 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012246
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247 s = PyUnicode_FromObject(s);
12248 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012249 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012250 if (sep != NULL) {
12251 sep = PyUnicode_FromObject(sep);
12252 if (sep == NULL) {
12253 Py_DECREF(s);
12254 return NULL;
12255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256 }
12257
Victor Stinner9310abb2011-10-05 00:59:23 +020012258 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259
12260 Py_DECREF(s);
12261 Py_XDECREF(sep);
12262 return result;
12263}
12264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012265PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012266 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267\n\
12268Return a list of the words in S, using sep as the\n\
12269delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012270splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012271whitespace string is a separator and empty strings are\n\
12272removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273
12274static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012275unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276{
12277 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012278 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279
Martin v. Löwis18e16552006-02-15 17:27:45 +000012280 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281 return NULL;
12282
12283 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012284 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012286 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012288 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289}
12290
Thomas Wouters477c8d52006-05-27 19:21:47 +000012291PyObject *
12292PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12293{
12294 PyObject* str_obj;
12295 PyObject* sep_obj;
12296 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 int kind1, kind2, kind;
12298 void *buf1 = NULL, *buf2 = NULL;
12299 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012300
12301 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012302 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012303 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012304 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012306 Py_DECREF(str_obj);
12307 return NULL;
12308 }
12309
Victor Stinner14f8f022011-10-05 20:58:25 +020012310 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012312 kind = Py_MAX(kind1, kind2);
12313 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012315 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 if (!buf1)
12317 goto onError;
12318 buf2 = PyUnicode_DATA(sep_obj);
12319 if (kind2 != kind)
12320 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12321 if (!buf2)
12322 goto onError;
12323 len1 = PyUnicode_GET_LENGTH(str_obj);
12324 len2 = PyUnicode_GET_LENGTH(sep_obj);
12325
Victor Stinner14f8f022011-10-05 20:58:25 +020012326 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012328 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12329 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12330 else
12331 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 break;
12333 case PyUnicode_2BYTE_KIND:
12334 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12335 break;
12336 case PyUnicode_4BYTE_KIND:
12337 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12338 break;
12339 default:
12340 assert(0);
12341 out = 0;
12342 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012343
12344 Py_DECREF(sep_obj);
12345 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 if (kind1 != kind)
12347 PyMem_Free(buf1);
12348 if (kind2 != kind)
12349 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012350
12351 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 onError:
12353 Py_DECREF(sep_obj);
12354 Py_DECREF(str_obj);
12355 if (kind1 != kind && buf1)
12356 PyMem_Free(buf1);
12357 if (kind2 != kind && buf2)
12358 PyMem_Free(buf2);
12359 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012360}
12361
12362
12363PyObject *
12364PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12365{
12366 PyObject* str_obj;
12367 PyObject* sep_obj;
12368 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 int kind1, kind2, kind;
12370 void *buf1 = NULL, *buf2 = NULL;
12371 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012372
12373 str_obj = PyUnicode_FromObject(str_in);
12374 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012376 sep_obj = PyUnicode_FromObject(sep_in);
12377 if (!sep_obj) {
12378 Py_DECREF(str_obj);
12379 return NULL;
12380 }
12381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 kind1 = PyUnicode_KIND(str_in);
12383 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012384 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012385 buf1 = PyUnicode_DATA(str_in);
12386 if (kind1 != kind)
12387 buf1 = _PyUnicode_AsKind(str_in, kind);
12388 if (!buf1)
12389 goto onError;
12390 buf2 = PyUnicode_DATA(sep_obj);
12391 if (kind2 != kind)
12392 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12393 if (!buf2)
12394 goto onError;
12395 len1 = PyUnicode_GET_LENGTH(str_obj);
12396 len2 = PyUnicode_GET_LENGTH(sep_obj);
12397
12398 switch(PyUnicode_KIND(str_in)) {
12399 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012400 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12401 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12402 else
12403 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 break;
12405 case PyUnicode_2BYTE_KIND:
12406 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12407 break;
12408 case PyUnicode_4BYTE_KIND:
12409 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12410 break;
12411 default:
12412 assert(0);
12413 out = 0;
12414 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012415
12416 Py_DECREF(sep_obj);
12417 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012418 if (kind1 != kind)
12419 PyMem_Free(buf1);
12420 if (kind2 != kind)
12421 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012422
12423 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424 onError:
12425 Py_DECREF(sep_obj);
12426 Py_DECREF(str_obj);
12427 if (kind1 != kind && buf1)
12428 PyMem_Free(buf1);
12429 if (kind2 != kind && buf2)
12430 PyMem_Free(buf2);
12431 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012432}
12433
12434PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012435 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012436\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012437Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012438the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012439found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012440
12441static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012442unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012443{
Victor Stinner9310abb2011-10-05 00:59:23 +020012444 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012445}
12446
12447PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012448 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012449\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012450Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012451the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012452separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012453
12454static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012455unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012456{
Victor Stinner9310abb2011-10-05 00:59:23 +020012457 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012458}
12459
Alexander Belopolsky40018472011-02-26 01:02:56 +000012460PyObject *
12461PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012462{
12463 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012464
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012465 s = PyUnicode_FromObject(s);
12466 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012467 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012468 if (sep != NULL) {
12469 sep = PyUnicode_FromObject(sep);
12470 if (sep == NULL) {
12471 Py_DECREF(s);
12472 return NULL;
12473 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012474 }
12475
Victor Stinner9310abb2011-10-05 00:59:23 +020012476 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012477
12478 Py_DECREF(s);
12479 Py_XDECREF(sep);
12480 return result;
12481}
12482
12483PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012484 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012485\n\
12486Return a list of the words in S, using sep as the\n\
12487delimiter string, starting at the end of the string and\n\
12488working to the front. If maxsplit is given, at most maxsplit\n\
12489splits are done. If sep is not specified, any whitespace string\n\
12490is a separator.");
12491
12492static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012493unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012494{
12495 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012496 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012497
Martin v. Löwis18e16552006-02-15 17:27:45 +000012498 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012499 return NULL;
12500
12501 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012502 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012503 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012504 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012505 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012506 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012507}
12508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012509PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012510 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511\n\
12512Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012513Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012514is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515
12516static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012517unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012519 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012520 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012522 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12523 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524 return NULL;
12525
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012526 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527}
12528
12529static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012530PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531{
Walter Dörwald346737f2007-05-31 10:44:43 +000012532 if (PyUnicode_CheckExact(self)) {
12533 Py_INCREF(self);
12534 return self;
12535 } else
12536 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012537 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538}
12539
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012540PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012541 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542\n\
12543Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012544and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545
12546static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012547unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549 return fixup(self, fixswapcase);
12550}
12551
Georg Brandlceee0772007-11-27 23:48:05 +000012552PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012553 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012554\n\
12555Return a translation table usable for str.translate().\n\
12556If there is only one argument, it must be a dictionary mapping Unicode\n\
12557ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012558Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012559If there are two arguments, they must be strings of equal length, and\n\
12560in the resulting dictionary, each character in x will be mapped to the\n\
12561character at the same position in y. If there is a third argument, it\n\
12562must be a string, whose characters will be mapped to None in the result.");
12563
12564static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012565unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012566{
12567 PyObject *x, *y = NULL, *z = NULL;
12568 PyObject *new = NULL, *key, *value;
12569 Py_ssize_t i = 0;
12570 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012571
Georg Brandlceee0772007-11-27 23:48:05 +000012572 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12573 return NULL;
12574 new = PyDict_New();
12575 if (!new)
12576 return NULL;
12577 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 int x_kind, y_kind, z_kind;
12579 void *x_data, *y_data, *z_data;
12580
Georg Brandlceee0772007-11-27 23:48:05 +000012581 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012582 if (!PyUnicode_Check(x)) {
12583 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12584 "be a string if there is a second argument");
12585 goto err;
12586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012587 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012588 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12589 "arguments must have equal length");
12590 goto err;
12591 }
12592 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012593 x_kind = PyUnicode_KIND(x);
12594 y_kind = PyUnicode_KIND(y);
12595 x_data = PyUnicode_DATA(x);
12596 y_data = PyUnicode_DATA(y);
12597 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12598 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12599 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012600 if (!key || !value)
12601 goto err;
12602 res = PyDict_SetItem(new, key, value);
12603 Py_DECREF(key);
12604 Py_DECREF(value);
12605 if (res < 0)
12606 goto err;
12607 }
12608 /* create entries for deleting chars in z */
12609 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 z_kind = PyUnicode_KIND(z);
12611 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012612 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012614 if (!key)
12615 goto err;
12616 res = PyDict_SetItem(new, key, Py_None);
12617 Py_DECREF(key);
12618 if (res < 0)
12619 goto err;
12620 }
12621 }
12622 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012623 int kind;
12624 void *data;
12625
Georg Brandlceee0772007-11-27 23:48:05 +000012626 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012627 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012628 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12629 "to maketrans it must be a dict");
12630 goto err;
12631 }
12632 /* copy entries into the new dict, converting string keys to int keys */
12633 while (PyDict_Next(x, &i, &key, &value)) {
12634 if (PyUnicode_Check(key)) {
12635 /* convert string keys to integer keys */
12636 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012637 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012638 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12639 "table must be of length 1");
12640 goto err;
12641 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 kind = PyUnicode_KIND(key);
12643 data = PyUnicode_DATA(key);
12644 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012645 if (!newkey)
12646 goto err;
12647 res = PyDict_SetItem(new, newkey, value);
12648 Py_DECREF(newkey);
12649 if (res < 0)
12650 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012651 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012652 /* just keep integer keys */
12653 if (PyDict_SetItem(new, key, value) < 0)
12654 goto err;
12655 } else {
12656 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12657 "be strings or integers");
12658 goto err;
12659 }
12660 }
12661 }
12662 return new;
12663 err:
12664 Py_DECREF(new);
12665 return NULL;
12666}
12667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012668PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012669 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670\n\
12671Return a copy of the string S, where all characters have been mapped\n\
12672through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012673Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012674Unmapped characters are left untouched. Characters mapped to None\n\
12675are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012676
12677static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012681}
12682
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012683PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012684 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012686Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687
12688static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012689unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691 return fixup(self, fixupper);
12692}
12693
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012694PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012695 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012697Pad a numeric string S with zeros on the left, to fill a field\n\
12698of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699
12700static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012701unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012702{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012703 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012704 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012705 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012706 int kind;
12707 void *data;
12708 Py_UCS4 chr;
12709
12710 if (PyUnicode_READY(self) == -1)
12711 return NULL;
12712
Martin v. Löwis18e16552006-02-15 17:27:45 +000012713 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714 return NULL;
12715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012717 if (PyUnicode_CheckExact(self)) {
12718 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012719 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012720 }
12721 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012722 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012723 }
12724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012726
12727 u = pad(self, fill, 0, '0');
12728
Walter Dörwald068325e2002-04-15 13:36:47 +000012729 if (u == NULL)
12730 return NULL;
12731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 kind = PyUnicode_KIND(u);
12733 data = PyUnicode_DATA(u);
12734 chr = PyUnicode_READ(kind, data, fill);
12735
12736 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 PyUnicode_WRITE(kind, data, 0, chr);
12739 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740 }
12741
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012742 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012743 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012744}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012745
#if 0
/* Disabled wrapper (compiled out by the surrounding "#if 0") that simply
   forwards self to PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012754PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012755 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012756\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012757Return True if S starts with the specified prefix, False otherwise.\n\
12758With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012759With optional end, stop comparing S at that position.\n\
12760prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012761
12762static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012763unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012764 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012765{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012766 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012767 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012768 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012769 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012770 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012771
Jesus Ceaac451502011-04-20 17:09:23 +020012772 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012773 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012774 if (PyTuple_Check(subobj)) {
12775 Py_ssize_t i;
12776 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012777 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012778 if (substring == NULL)
12779 return NULL;
12780 result = tailmatch(self, substring, start, end, -1);
12781 Py_DECREF(substring);
12782 if (result) {
12783 Py_RETURN_TRUE;
12784 }
12785 }
12786 /* nothing matched */
12787 Py_RETURN_FALSE;
12788 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012789 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012790 if (substring == NULL) {
12791 if (PyErr_ExceptionMatches(PyExc_TypeError))
12792 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12793 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012794 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012795 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012796 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012797 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012798 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799}
12800
12801
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012802PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012803 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012804\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012805Return True if S ends with the specified suffix, False otherwise.\n\
12806With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012807With optional end, stop comparing S at that position.\n\
12808suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012809
12810static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012811unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012812 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012814 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012815 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012816 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012817 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012818 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819
Jesus Ceaac451502011-04-20 17:09:23 +020012820 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012821 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012822 if (PyTuple_Check(subobj)) {
12823 Py_ssize_t i;
12824 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012825 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012826 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012827 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012828 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012829 result = tailmatch(self, substring, start, end, +1);
12830 Py_DECREF(substring);
12831 if (result) {
12832 Py_RETURN_TRUE;
12833 }
12834 }
12835 Py_RETURN_FALSE;
12836 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012837 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012838 if (substring == NULL) {
12839 if (PyErr_ExceptionMatches(PyExc_TypeError))
12840 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12841 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012842 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012843 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012844 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012845 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012846 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847}
12848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012849#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012850
12851PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012852 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012853\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012854Return a formatted version of S, using substitutions from args and kwargs.\n\
12855The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012856
Eric Smith27bbca62010-11-04 17:06:58 +000012857PyDoc_STRVAR(format_map__doc__,
12858 "S.format_map(mapping) -> str\n\
12859\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012860Return a formatted version of S, using substitutions from mapping.\n\
12861The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012862
Eric Smith4a7d76d2008-05-30 18:10:19 +000012863static PyObject *
12864unicode__format__(PyObject* self, PyObject* args)
12865{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012866 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012867
12868 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12869 return NULL;
12870
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012871 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012872 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012873 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012874}
12875
Eric Smith8c663262007-08-25 02:26:07 +000012876PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012877 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012878\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012879Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012880
12881static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012882unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012883{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012884 Py_ssize_t size;
12885
12886 /* If it's a compact object, account for base structure +
12887 character data. */
12888 if (PyUnicode_IS_COMPACT_ASCII(v))
12889 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12890 else if (PyUnicode_IS_COMPACT(v))
12891 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012892 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012893 else {
12894 /* If it is a two-block object, account for base object, and
12895 for character block if present. */
12896 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012897 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012898 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012899 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012900 }
12901 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012902 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012903 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012904 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012905 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012906 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012907
12908 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012909}
12910
12911PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012912 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012913
12914static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012915unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012916{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012917 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012918 if (!copy)
12919 return NULL;
12920 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012921}
12922
Guido van Rossumd57fd912000-03-10 22:53:23 +000012923static PyMethodDef unicode_methods[] = {
12924
12925 /* Order is according to common usage: often used methods should
12926 appear first, since lookup is done sequentially. */
12927
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012928 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012929 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12930 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012931 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012932 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12933 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12934 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12935 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12936 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12937 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12938 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012939 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012940 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12941 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12942 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012943 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012944 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12945 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12946 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012947 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012948 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012949 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012950 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012951 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12952 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12953 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12954 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12955 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12956 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12957 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12958 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12959 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12960 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12961 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12962 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12963 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12964 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012965 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012966 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012967 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012968 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012969 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012970 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012971 {"maketrans", (PyCFunction) unicode_maketrans,
12972 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012973 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012974#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012975 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012976#endif
12977
12978#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012979 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012980 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012981#endif
12982
Benjamin Peterson14339b62009-01-31 16:36:08 +000012983 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984 {NULL, NULL}
12985};
12986
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012987static PyObject *
12988unicode_mod(PyObject *v, PyObject *w)
12989{
Brian Curtindfc80e32011-08-10 20:28:54 -050012990 if (!PyUnicode_Check(v))
12991 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012992 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012993}
12994
12995static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012996 0, /*nb_add*/
12997 0, /*nb_subtract*/
12998 0, /*nb_multiply*/
12999 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013000};
13001
Guido van Rossumd57fd912000-03-10 22:53:23 +000013002static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013003 (lenfunc) unicode_length, /* sq_length */
13004 PyUnicode_Concat, /* sq_concat */
13005 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13006 (ssizeargfunc) unicode_getitem, /* sq_item */
13007 0, /* sq_slice */
13008 0, /* sq_ass_item */
13009 0, /* sq_ass_slice */
13010 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013011};
13012
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013013static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013014unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013015{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013016 if (PyUnicode_READY(self) == -1)
13017 return NULL;
13018
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013019 if (PyIndex_Check(item)) {
13020 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013021 if (i == -1 && PyErr_Occurred())
13022 return NULL;
13023 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013024 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013025 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013026 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013027 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013028 PyObject *result;
13029 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013030 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013031 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013033 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013034 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013035 return NULL;
13036 }
13037
13038 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013039 return PyUnicode_New(0, 0);
13040 } else if (start == 0 && step == 1 &&
13041 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013042 PyUnicode_CheckExact(self)) {
13043 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013044 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000013045 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013046 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013047 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013048 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013049 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013050 src_kind = PyUnicode_KIND(self);
13051 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013052 if (!PyUnicode_IS_ASCII(self)) {
13053 kind_limit = kind_maxchar_limit(src_kind);
13054 max_char = 0;
13055 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13056 ch = PyUnicode_READ(src_kind, src_data, cur);
13057 if (ch > max_char) {
13058 max_char = ch;
13059 if (max_char >= kind_limit)
13060 break;
13061 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013062 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013063 }
Victor Stinner55c99112011-10-13 01:17:06 +020013064 else
13065 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013066 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013067 if (result == NULL)
13068 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013069 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013070 dest_data = PyUnicode_DATA(result);
13071
13072 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013073 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13074 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013075 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013076 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013077 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013078 } else {
13079 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13080 return NULL;
13081 }
13082}
13083
13084static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013085 (lenfunc)unicode_length, /* mp_length */
13086 (binaryfunc)unicode_subscript, /* mp_subscript */
13087 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013088};
13089
Guido van Rossumd57fd912000-03-10 22:53:23 +000013090
Guido van Rossumd57fd912000-03-10 22:53:23 +000013091/* Helpers for PyUnicode_Format() */
13092
13093static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013094getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013096 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013098 (*p_argidx)++;
13099 if (arglen < 0)
13100 return args;
13101 else
13102 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103 }
13104 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013105 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106 return NULL;
13107}
13108
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013109/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013111static PyObject *
13112formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013114 char *p;
13115 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013117
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118 x = PyFloat_AsDouble(v);
13119 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013120 return NULL;
13121
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013123 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013124
Eric Smith0923d1d2009-04-16 20:16:10 +000013125 p = PyOS_double_to_string(x, type, prec,
13126 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013127 if (p == NULL)
13128 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013129 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013130 PyMem_Free(p);
13131 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132}
13133
Tim Peters38fd5b62000-09-21 05:43:11 +000013134static PyObject*
13135formatlong(PyObject *val, int flags, int prec, int type)
13136{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013137 char *buf;
13138 int len;
13139 PyObject *str; /* temporary string object. */
13140 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013141
Benjamin Peterson14339b62009-01-31 16:36:08 +000013142 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13143 if (!str)
13144 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013145 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013146 Py_DECREF(str);
13147 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013148}
13149
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013150static Py_UCS4
13151formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013152{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013153 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013154 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013155 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013156 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013157 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013158 goto onError;
13159 }
13160 else {
13161 /* Integer input truncated to a character */
13162 long x;
13163 x = PyLong_AsLong(v);
13164 if (x == -1 && PyErr_Occurred())
13165 goto onError;
13166
13167 if (x < 0 || x > 0x10ffff) {
13168 PyErr_SetString(PyExc_OverflowError,
13169 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013170 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013171 }
13172
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013173 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013174 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013175
Benjamin Peterson29060642009-01-31 22:14:21 +000013176 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013177 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013178 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013179 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013180}
13181
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013182static int
13183repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13184{
13185 int r;
13186 assert(count > 0);
13187 assert(PyUnicode_Check(obj));
13188 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013189 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013190 if (repeated == NULL)
13191 return -1;
13192 r = _PyAccu_Accumulate(acc, repeated);
13193 Py_DECREF(repeated);
13194 return r;
13195 }
13196 else {
13197 do {
13198 if (_PyAccu_Accumulate(acc, obj))
13199 return -1;
13200 } while (--count);
13201 return 0;
13202 }
13203}
13204
Alexander Belopolsky40018472011-02-26 01:02:56 +000013205PyObject *
13206PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013208 void *fmt;
13209 int fmtkind;
13210 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013211 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013212 int r;
13213 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013214 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013216 PyObject *temp = NULL;
13217 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013218 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013219 _PyAccu acc;
13220 static PyObject *plus, *minus, *blank, *zero, *percent;
13221
13222 if (!plus && !(plus = get_latin1_char('+')))
13223 return NULL;
13224 if (!minus && !(minus = get_latin1_char('-')))
13225 return NULL;
13226 if (!blank && !(blank = get_latin1_char(' ')))
13227 return NULL;
13228 if (!zero && !(zero = get_latin1_char('0')))
13229 return NULL;
13230 if (!percent && !(percent = get_latin1_char('%')))
13231 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013232
Guido van Rossumd57fd912000-03-10 22:53:23 +000013233 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013234 PyErr_BadInternalCall();
13235 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013236 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013237 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013238 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013239 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013240 if (_PyAccu_Init(&acc))
13241 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013242 fmt = PyUnicode_DATA(uformat);
13243 fmtkind = PyUnicode_KIND(uformat);
13244 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13245 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013246
Guido van Rossumd57fd912000-03-10 22:53:23 +000013247 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013248 arglen = PyTuple_Size(args);
13249 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013250 }
13251 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013252 arglen = -1;
13253 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013254 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013255 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013256 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013257 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013258
13259 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013260 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013261 PyObject *nonfmt;
13262 Py_ssize_t nonfmtpos;
13263 nonfmtpos = fmtpos++;
13264 while (fmtcnt >= 0 &&
13265 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13266 fmtpos++;
13267 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013268 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013269 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013270 if (nonfmt == NULL)
13271 goto onError;
13272 r = _PyAccu_Accumulate(&acc, nonfmt);
13273 Py_DECREF(nonfmt);
13274 if (r)
13275 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013276 }
13277 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013278 /* Got a format specifier */
13279 int flags = 0;
13280 Py_ssize_t width = -1;
13281 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013282 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013283 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013284 int isnumok;
13285 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013286 void *pbuf = NULL;
13287 Py_ssize_t pindex, len;
13288 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013290 fmtpos++;
13291 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13292 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013293 Py_ssize_t keylen;
13294 PyObject *key;
13295 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013296
Benjamin Peterson29060642009-01-31 22:14:21 +000013297 if (dict == NULL) {
13298 PyErr_SetString(PyExc_TypeError,
13299 "format requires a mapping");
13300 goto onError;
13301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013302 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013303 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013304 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013305 /* Skip over balanced parentheses */
13306 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013307 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013308 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013309 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013310 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013311 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013313 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013314 if (fmtcnt < 0 || pcount > 0) {
13315 PyErr_SetString(PyExc_ValueError,
13316 "incomplete format key");
13317 goto onError;
13318 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013319 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013320 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013321 if (key == NULL)
13322 goto onError;
13323 if (args_owned) {
13324 Py_DECREF(args);
13325 args_owned = 0;
13326 }
13327 args = PyObject_GetItem(dict, key);
13328 Py_DECREF(key);
13329 if (args == NULL) {
13330 goto onError;
13331 }
13332 args_owned = 1;
13333 arglen = -1;
13334 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013335 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013336 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013337 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013338 case '-': flags |= F_LJUST; continue;
13339 case '+': flags |= F_SIGN; continue;
13340 case ' ': flags |= F_BLANK; continue;
13341 case '#': flags |= F_ALT; continue;
13342 case '0': flags |= F_ZERO; continue;
13343 }
13344 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013345 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013346 if (c == '*') {
13347 v = getnextarg(args, arglen, &argidx);
13348 if (v == NULL)
13349 goto onError;
13350 if (!PyLong_Check(v)) {
13351 PyErr_SetString(PyExc_TypeError,
13352 "* wants int");
13353 goto onError;
13354 }
13355 width = PyLong_AsLong(v);
13356 if (width == -1 && PyErr_Occurred())
13357 goto onError;
13358 if (width < 0) {
13359 flags |= F_LJUST;
13360 width = -width;
13361 }
13362 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013363 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013364 }
13365 else if (c >= '0' && c <= '9') {
13366 width = c - '0';
13367 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013368 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013369 if (c < '0' || c > '9')
13370 break;
13371 if ((width*10) / 10 != width) {
13372 PyErr_SetString(PyExc_ValueError,
13373 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013374 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013375 }
13376 width = width*10 + (c - '0');
13377 }
13378 }
13379 if (c == '.') {
13380 prec = 0;
13381 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013382 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013383 if (c == '*') {
13384 v = getnextarg(args, arglen, &argidx);
13385 if (v == NULL)
13386 goto onError;
13387 if (!PyLong_Check(v)) {
13388 PyErr_SetString(PyExc_TypeError,
13389 "* wants int");
13390 goto onError;
13391 }
13392 prec = PyLong_AsLong(v);
13393 if (prec == -1 && PyErr_Occurred())
13394 goto onError;
13395 if (prec < 0)
13396 prec = 0;
13397 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013398 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 }
13400 else if (c >= '0' && c <= '9') {
13401 prec = c - '0';
13402 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013403 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013404 if (c < '0' || c > '9')
13405 break;
13406 if ((prec*10) / 10 != prec) {
13407 PyErr_SetString(PyExc_ValueError,
13408 "prec too big");
13409 goto onError;
13410 }
13411 prec = prec*10 + (c - '0');
13412 }
13413 }
13414 } /* prec */
13415 if (fmtcnt >= 0) {
13416 if (c == 'h' || c == 'l' || c == 'L') {
13417 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013418 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013419 }
13420 }
13421 if (fmtcnt < 0) {
13422 PyErr_SetString(PyExc_ValueError,
13423 "incomplete format");
13424 goto onError;
13425 }
13426 if (c != '%') {
13427 v = getnextarg(args, arglen, &argidx);
13428 if (v == NULL)
13429 goto onError;
13430 }
13431 sign = 0;
13432 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013433 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013434 switch (c) {
13435
13436 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013437 _PyAccu_Accumulate(&acc, percent);
13438 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013439
13440 case 's':
13441 case 'r':
13442 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013443 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013444 temp = v;
13445 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013446 }
13447 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013448 if (c == 's')
13449 temp = PyObject_Str(v);
13450 else if (c == 'r')
13451 temp = PyObject_Repr(v);
13452 else
13453 temp = PyObject_ASCII(v);
13454 if (temp == NULL)
13455 goto onError;
13456 if (PyUnicode_Check(temp))
13457 /* nothing to do */;
13458 else {
13459 Py_DECREF(temp);
13460 PyErr_SetString(PyExc_TypeError,
13461 "%s argument has non-string str()");
13462 goto onError;
13463 }
13464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013465 if (PyUnicode_READY(temp) == -1) {
13466 Py_CLEAR(temp);
13467 goto onError;
13468 }
13469 pbuf = PyUnicode_DATA(temp);
13470 kind = PyUnicode_KIND(temp);
13471 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013472 if (prec >= 0 && len > prec)
13473 len = prec;
13474 break;
13475
13476 case 'i':
13477 case 'd':
13478 case 'u':
13479 case 'o':
13480 case 'x':
13481 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013482 isnumok = 0;
13483 if (PyNumber_Check(v)) {
13484 PyObject *iobj=NULL;
13485
13486 if (PyLong_Check(v)) {
13487 iobj = v;
13488 Py_INCREF(iobj);
13489 }
13490 else {
13491 iobj = PyNumber_Long(v);
13492 }
13493 if (iobj!=NULL) {
13494 if (PyLong_Check(iobj)) {
13495 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013496 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013497 Py_DECREF(iobj);
13498 if (!temp)
13499 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013500 if (PyUnicode_READY(temp) == -1) {
13501 Py_CLEAR(temp);
13502 goto onError;
13503 }
13504 pbuf = PyUnicode_DATA(temp);
13505 kind = PyUnicode_KIND(temp);
13506 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013507 sign = 1;
13508 }
13509 else {
13510 Py_DECREF(iobj);
13511 }
13512 }
13513 }
13514 if (!isnumok) {
13515 PyErr_Format(PyExc_TypeError,
13516 "%%%c format: a number is required, "
13517 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13518 goto onError;
13519 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013520 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013521 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013522 fillobj = zero;
13523 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013524 break;
13525
13526 case 'e':
13527 case 'E':
13528 case 'f':
13529 case 'F':
13530 case 'g':
13531 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013532 temp = formatfloat(v, flags, prec, c);
13533 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013534 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013535 if (PyUnicode_READY(temp) == -1) {
13536 Py_CLEAR(temp);
13537 goto onError;
13538 }
13539 pbuf = PyUnicode_DATA(temp);
13540 kind = PyUnicode_KIND(temp);
13541 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013542 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013543 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013544 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013545 fillobj = zero;
13546 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013547 break;
13548
13549 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013550 {
13551 Py_UCS4 ch = formatchar(v);
13552 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013553 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013554 temp = _PyUnicode_FromUCS4(&ch, 1);
13555 if (temp == NULL)
13556 goto onError;
13557 pbuf = PyUnicode_DATA(temp);
13558 kind = PyUnicode_KIND(temp);
13559 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013560 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013561 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013562
13563 default:
13564 PyErr_Format(PyExc_ValueError,
13565 "unsupported format character '%c' (0x%x) "
13566 "at index %zd",
13567 (31<=c && c<=126) ? (char)c : '?',
13568 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013569 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013570 goto onError;
13571 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013572 /* pbuf is initialized here. */
13573 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013574 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013575 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13576 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013577 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013578 pindex++;
13579 }
13580 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13581 signobj = plus;
13582 len--;
13583 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013584 }
13585 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013586 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013587 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013588 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013589 else
13590 sign = 0;
13591 }
13592 if (width < len)
13593 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013594 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013595 if (fill != ' ') {
13596 assert(signobj != NULL);
13597 if (_PyAccu_Accumulate(&acc, signobj))
13598 goto onError;
13599 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013600 if (width > len)
13601 width--;
13602 }
13603 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013604 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013605 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013606 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013607 second = get_latin1_char(
13608 PyUnicode_READ(kind, pbuf, pindex + 1));
13609 pindex += 2;
13610 if (second == NULL ||
13611 _PyAccu_Accumulate(&acc, zero) ||
13612 _PyAccu_Accumulate(&acc, second))
13613 goto onError;
13614 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013615 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013616 width -= 2;
13617 if (width < 0)
13618 width = 0;
13619 len -= 2;
13620 }
13621 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013622 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013623 if (repeat_accumulate(&acc, fillobj, width - len))
13624 goto onError;
13625 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013626 }
13627 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013628 if (sign) {
13629 assert(signobj != NULL);
13630 if (_PyAccu_Accumulate(&acc, signobj))
13631 goto onError;
13632 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013633 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013634 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13635 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013636 second = get_latin1_char(
13637 PyUnicode_READ(kind, pbuf, pindex + 1));
13638 pindex += 2;
13639 if (second == NULL ||
13640 _PyAccu_Accumulate(&acc, zero) ||
13641 _PyAccu_Accumulate(&acc, second))
13642 goto onError;
13643 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013644 }
13645 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013646 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013647 if (temp != NULL) {
13648 assert(pbuf == PyUnicode_DATA(temp));
13649 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013650 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013651 else {
13652 const char *p = (const char *) pbuf;
13653 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013654 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013655 v = PyUnicode_FromKindAndData(kind, p, len);
13656 }
13657 if (v == NULL)
13658 goto onError;
13659 r = _PyAccu_Accumulate(&acc, v);
13660 Py_DECREF(v);
13661 if (r)
13662 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013663 if (width > len && repeat_accumulate(&acc, blank, width - len))
13664 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013665 if (dict && (argidx < arglen) && c != '%') {
13666 PyErr_SetString(PyExc_TypeError,
13667 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013668 goto onError;
13669 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013670 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013671 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013672 } /* until end */
13673 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013674 PyErr_SetString(PyExc_TypeError,
13675 "not all arguments converted during string formatting");
13676 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013677 }
13678
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013679 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013680 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013681 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013682 }
13683 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013684 Py_XDECREF(temp);
13685 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013686 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013687
Benjamin Peterson29060642009-01-31 22:14:21 +000013688 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013689 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013690 Py_XDECREF(temp);
13691 Py_XDECREF(second);
13692 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013693 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013694 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013695 }
13696 return NULL;
13697}
13698
Jeremy Hylton938ace62002-07-17 16:30:39 +000013699static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013700unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13701
Tim Peters6d6c1a32001-08-02 04:15:00 +000013702static PyObject *
13703unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13704{
Benjamin Peterson29060642009-01-31 22:14:21 +000013705 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013706 static char *kwlist[] = {"object", "encoding", "errors", 0};
13707 char *encoding = NULL;
13708 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013709
Benjamin Peterson14339b62009-01-31 16:36:08 +000013710 if (type != &PyUnicode_Type)
13711 return unicode_subtype_new(type, args, kwds);
13712 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013713 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013714 return NULL;
13715 if (x == NULL)
Victor Stinner7931d9a2011-11-04 00:22:48 +010013716 return PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013717 if (encoding == NULL && errors == NULL)
13718 return PyObject_Str(x);
13719 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013720 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013721}
13722
Guido van Rossume023fe02001-08-30 03:12:59 +000013723static PyObject *
13724unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13725{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013726 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013727 Py_ssize_t length, char_size;
13728 int share_wstr, share_utf8;
13729 unsigned int kind;
13730 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013731
Benjamin Peterson14339b62009-01-31 16:36:08 +000013732 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013733
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013734 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013735 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013736 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013737 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013738 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013739 return NULL;
13740
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013741 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013742 if (self == NULL) {
13743 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013744 return NULL;
13745 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013746 kind = PyUnicode_KIND(unicode);
13747 length = PyUnicode_GET_LENGTH(unicode);
13748
13749 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013750#ifdef Py_DEBUG
13751 _PyUnicode_HASH(self) = -1;
13752#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013753 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013754#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013755 _PyUnicode_STATE(self).interned = 0;
13756 _PyUnicode_STATE(self).kind = kind;
13757 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013758 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013759 _PyUnicode_STATE(self).ready = 1;
13760 _PyUnicode_WSTR(self) = NULL;
13761 _PyUnicode_UTF8_LENGTH(self) = 0;
13762 _PyUnicode_UTF8(self) = NULL;
13763 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013764 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013765
13766 share_utf8 = 0;
13767 share_wstr = 0;
13768 if (kind == PyUnicode_1BYTE_KIND) {
13769 char_size = 1;
13770 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13771 share_utf8 = 1;
13772 }
13773 else if (kind == PyUnicode_2BYTE_KIND) {
13774 char_size = 2;
13775 if (sizeof(wchar_t) == 2)
13776 share_wstr = 1;
13777 }
13778 else {
13779 assert(kind == PyUnicode_4BYTE_KIND);
13780 char_size = 4;
13781 if (sizeof(wchar_t) == 4)
13782 share_wstr = 1;
13783 }
13784
13785 /* Ensure we won't overflow the length. */
13786 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13787 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013788 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013789 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013790 data = PyObject_MALLOC((length + 1) * char_size);
13791 if (data == NULL) {
13792 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013793 goto onError;
13794 }
13795
Victor Stinnerc3c74152011-10-02 20:39:55 +020013796 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013797 if (share_utf8) {
13798 _PyUnicode_UTF8_LENGTH(self) = length;
13799 _PyUnicode_UTF8(self) = data;
13800 }
13801 if (share_wstr) {
13802 _PyUnicode_WSTR_LENGTH(self) = length;
13803 _PyUnicode_WSTR(self) = (wchar_t *)data;
13804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013805
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013806 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013807 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013808 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013809#ifdef Py_DEBUG
13810 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13811#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013812 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013813 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013814
13815onError:
13816 Py_DECREF(unicode);
13817 Py_DECREF(self);
13818 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013819}
13820
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013821PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013822 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013823\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013824Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013825encoding defaults to the current default string encoding.\n\
13826errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013827
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013828static PyObject *unicode_iter(PyObject *seq);
13829
Guido van Rossumd57fd912000-03-10 22:53:23 +000013830PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013831 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013832 "str", /* tp_name */
13833 sizeof(PyUnicodeObject), /* tp_size */
13834 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013835 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013836 (destructor)unicode_dealloc, /* tp_dealloc */
13837 0, /* tp_print */
13838 0, /* tp_getattr */
13839 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013840 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013841 unicode_repr, /* tp_repr */
13842 &unicode_as_number, /* tp_as_number */
13843 &unicode_as_sequence, /* tp_as_sequence */
13844 &unicode_as_mapping, /* tp_as_mapping */
13845 (hashfunc) unicode_hash, /* tp_hash*/
13846 0, /* tp_call*/
13847 (reprfunc) unicode_str, /* tp_str */
13848 PyObject_GenericGetAttr, /* tp_getattro */
13849 0, /* tp_setattro */
13850 0, /* tp_as_buffer */
13851 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013852 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013853 unicode_doc, /* tp_doc */
13854 0, /* tp_traverse */
13855 0, /* tp_clear */
13856 PyUnicode_RichCompare, /* tp_richcompare */
13857 0, /* tp_weaklistoffset */
13858 unicode_iter, /* tp_iter */
13859 0, /* tp_iternext */
13860 unicode_methods, /* tp_methods */
13861 0, /* tp_members */
13862 0, /* tp_getset */
13863 &PyBaseObject_Type, /* tp_base */
13864 0, /* tp_dict */
13865 0, /* tp_descr_get */
13866 0, /* tp_descr_set */
13867 0, /* tp_dictoffset */
13868 0, /* tp_init */
13869 0, /* tp_alloc */
13870 unicode_new, /* tp_new */
13871 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013872};
13873
13874/* Initialize the Unicode implementation */
13875
Victor Stinner3a50e702011-10-18 21:21:00 +020013876int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013877{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013878 int i;
13879
Thomas Wouters477c8d52006-05-27 19:21:47 +000013880 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013881 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013882 0x000A, /* LINE FEED */
13883 0x000D, /* CARRIAGE RETURN */
13884 0x001C, /* FILE SEPARATOR */
13885 0x001D, /* GROUP SEPARATOR */
13886 0x001E, /* RECORD SEPARATOR */
13887 0x0085, /* NEXT LINE */
13888 0x2028, /* LINE SEPARATOR */
13889 0x2029, /* PARAGRAPH SEPARATOR */
13890 };
13891
Fred Drakee4315f52000-05-09 19:53:39 +000013892 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013893 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013894 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013895 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013896 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013897
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013898 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013899 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013900 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013901 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013902
13903 /* initialize the linebreak bloom filter */
13904 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013905 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013906 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013907
13908 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013909
13910#ifdef HAVE_MBCS
13911 winver.dwOSVersionInfoSize = sizeof(winver);
13912 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13913 PyErr_SetFromWindowsErr(0);
13914 return -1;
13915 }
13916#endif
13917 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013918}
13919
13920/* Finalize the Unicode implementation */
13921
/* Does no work in this implementation; always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
13927
Guido van Rossumd57fd912000-03-10 22:53:23 +000013928void
Thomas Wouters78890102000-07-22 19:25:51 +000013929_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013930{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013931 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013932
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013933 Py_XDECREF(unicode_empty);
13934 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013935
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013936 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013937 if (unicode_latin1[i]) {
13938 Py_DECREF(unicode_latin1[i]);
13939 unicode_latin1[i] = NULL;
13940 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013941 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013942 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013943 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013944}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013945
Walter Dörwald16807132007-05-25 13:52:07 +000013946void
13947PyUnicode_InternInPlace(PyObject **p)
13948{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013949 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013950 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013951#ifdef Py_DEBUG
13952 assert(s != NULL);
13953 assert(_PyUnicode_CHECK(s));
13954#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013955 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013956 return;
13957#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013958 /* If it's a subclass, we don't really know what putting
13959 it in the interned dict might do. */
13960 if (!PyUnicode_CheckExact(s))
13961 return;
13962 if (PyUnicode_CHECK_INTERNED(s))
13963 return;
13964 if (interned == NULL) {
13965 interned = PyDict_New();
13966 if (interned == NULL) {
13967 PyErr_Clear(); /* Don't leave an exception */
13968 return;
13969 }
13970 }
13971 /* It might be that the GetItem call fails even
13972 though the key is present in the dictionary,
13973 namely when this happens during a stack overflow. */
13974 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013975 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013976 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013977
Benjamin Peterson29060642009-01-31 22:14:21 +000013978 if (t) {
13979 Py_INCREF(t);
13980 Py_DECREF(*p);
13981 *p = t;
13982 return;
13983 }
Walter Dörwald16807132007-05-25 13:52:07 +000013984
Benjamin Peterson14339b62009-01-31 16:36:08 +000013985 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013986 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013987 PyErr_Clear();
13988 PyThreadState_GET()->recursion_critical = 0;
13989 return;
13990 }
13991 PyThreadState_GET()->recursion_critical = 0;
13992 /* The two references in interned are not counted by refcnt.
13993 The deallocator will take care of this */
13994 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013995 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013996}
13997
13998void
13999PyUnicode_InternImmortal(PyObject **p)
14000{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014001 PyUnicode_InternInPlace(p);
14002 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014003 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014004 Py_INCREF(*p);
14005 }
Walter Dörwald16807132007-05-25 13:52:07 +000014006}
14007
14008PyObject *
14009PyUnicode_InternFromString(const char *cp)
14010{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014011 PyObject *s = PyUnicode_FromString(cp);
14012 if (s == NULL)
14013 return NULL;
14014 PyUnicode_InternInPlace(&s);
14015 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014016}
14017
Alexander Belopolsky40018472011-02-26 01:02:56 +000014018void
14019_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014020{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014021 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014022 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014023 Py_ssize_t i, n;
14024 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014025
Benjamin Peterson14339b62009-01-31 16:36:08 +000014026 if (interned == NULL || !PyDict_Check(interned))
14027 return;
14028 keys = PyDict_Keys(interned);
14029 if (keys == NULL || !PyList_Check(keys)) {
14030 PyErr_Clear();
14031 return;
14032 }
Walter Dörwald16807132007-05-25 13:52:07 +000014033
Benjamin Peterson14339b62009-01-31 16:36:08 +000014034 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14035 detector, interned unicode strings are not forcibly deallocated;
14036 rather, we give them their stolen references back, and then clear
14037 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014038
Benjamin Peterson14339b62009-01-31 16:36:08 +000014039 n = PyList_GET_SIZE(keys);
14040 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014041 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014042 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014043 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014044 if (PyUnicode_READY(s) == -1) {
14045 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014046 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014048 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014049 case SSTATE_NOT_INTERNED:
14050 /* XXX Shouldn't happen */
14051 break;
14052 case SSTATE_INTERNED_IMMORTAL:
14053 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014054 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014055 break;
14056 case SSTATE_INTERNED_MORTAL:
14057 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014058 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014059 break;
14060 default:
14061 Py_FatalError("Inconsistent interned string state.");
14062 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014063 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014064 }
14065 fprintf(stderr, "total size of all interned strings: "
14066 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14067 "mortal/immortal\n", mortal_size, immortal_size);
14068 Py_DECREF(keys);
14069 PyDict_Clear(interned);
14070 Py_DECREF(interned);
14071 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014072}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014073
14074
14075/********************* Unicode Iterator **************************/
14076
14077typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014078 PyObject_HEAD
14079 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014080 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014081} unicodeiterobject;
14082
14083static void
14084unicodeiter_dealloc(unicodeiterobject *it)
14085{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014086 _PyObject_GC_UNTRACK(it);
14087 Py_XDECREF(it->it_seq);
14088 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014089}
14090
14091static int
14092unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14093{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014094 Py_VISIT(it->it_seq);
14095 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014096}
14097
14098static PyObject *
14099unicodeiter_next(unicodeiterobject *it)
14100{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014101 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014102
Benjamin Peterson14339b62009-01-31 16:36:08 +000014103 assert(it != NULL);
14104 seq = it->it_seq;
14105 if (seq == NULL)
14106 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014107 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014109 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14110 int kind = PyUnicode_KIND(seq);
14111 void *data = PyUnicode_DATA(seq);
14112 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14113 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014114 if (item != NULL)
14115 ++it->it_index;
14116 return item;
14117 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014118
Benjamin Peterson14339b62009-01-31 16:36:08 +000014119 Py_DECREF(seq);
14120 it->it_seq = NULL;
14121 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014122}
14123
14124static PyObject *
14125unicodeiter_len(unicodeiterobject *it)
14126{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014127 Py_ssize_t len = 0;
14128 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014129 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014130 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014131}
14132
14133PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14134
14135static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014136 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014137 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014138 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014139};
14140
14141PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014142 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14143 "str_iterator", /* tp_name */
14144 sizeof(unicodeiterobject), /* tp_basicsize */
14145 0, /* tp_itemsize */
14146 /* methods */
14147 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14148 0, /* tp_print */
14149 0, /* tp_getattr */
14150 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014151 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014152 0, /* tp_repr */
14153 0, /* tp_as_number */
14154 0, /* tp_as_sequence */
14155 0, /* tp_as_mapping */
14156 0, /* tp_hash */
14157 0, /* tp_call */
14158 0, /* tp_str */
14159 PyObject_GenericGetAttr, /* tp_getattro */
14160 0, /* tp_setattro */
14161 0, /* tp_as_buffer */
14162 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14163 0, /* tp_doc */
14164 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14165 0, /* tp_clear */
14166 0, /* tp_richcompare */
14167 0, /* tp_weaklistoffset */
14168 PyObject_SelfIter, /* tp_iter */
14169 (iternextfunc)unicodeiter_next, /* tp_iternext */
14170 unicodeiter_methods, /* tp_methods */
14171 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014172};
14173
14174static PyObject *
14175unicode_iter(PyObject *seq)
14176{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014177 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014178
Benjamin Peterson14339b62009-01-31 16:36:08 +000014179 if (!PyUnicode_Check(seq)) {
14180 PyErr_BadInternalCall();
14181 return NULL;
14182 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014183 if (PyUnicode_READY(seq) == -1)
14184 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014185 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14186 if (it == NULL)
14187 return NULL;
14188 it->it_index = 0;
14189 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014190 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014191 _PyObject_GC_TRACK(it);
14192 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014193}
14194
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014195
14196size_t
14197Py_UNICODE_strlen(const Py_UNICODE *u)
14198{
14199 int res = 0;
14200 while(*u++)
14201 res++;
14202 return res;
14203}
14204
14205Py_UNICODE*
14206Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14207{
14208 Py_UNICODE *u = s1;
14209 while ((*u++ = *s2++));
14210 return s1;
14211}
14212
14213Py_UNICODE*
14214Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14215{
14216 Py_UNICODE *u = s1;
14217 while ((*u++ = *s2++))
14218 if (n-- == 0)
14219 break;
14220 return s1;
14221}
14222
14223Py_UNICODE*
14224Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14225{
14226 Py_UNICODE *u1 = s1;
14227 u1 += Py_UNICODE_strlen(u1);
14228 Py_UNICODE_strcpy(u1, s2);
14229 return s1;
14230}
14231
14232int
14233Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14234{
14235 while (*s1 && *s2 && *s1 == *s2)
14236 s1++, s2++;
14237 if (*s1 && *s2)
14238 return (*s1 < *s2) ? -1 : +1;
14239 if (*s1)
14240 return 1;
14241 if (*s2)
14242 return -1;
14243 return 0;
14244}
14245
14246int
14247Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14248{
14249 register Py_UNICODE u1, u2;
14250 for (; n != 0; n--) {
14251 u1 = *s1;
14252 u2 = *s2;
14253 if (u1 != u2)
14254 return (u1 < u2) ? -1 : +1;
14255 if (u1 == '\0')
14256 return 0;
14257 s1++;
14258 s2++;
14259 }
14260 return 0;
14261}
14262
14263Py_UNICODE*
14264Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14265{
14266 const Py_UNICODE *p;
14267 for (p = s; *p; p++)
14268 if (*p == c)
14269 return (Py_UNICODE*)p;
14270 return NULL;
14271}
14272
14273Py_UNICODE*
14274Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14275{
14276 const Py_UNICODE *p;
14277 p = s + Py_UNICODE_strlen(s);
14278 while (p != s) {
14279 p--;
14280 if (*p == c)
14281 return (Py_UNICODE*)p;
14282 }
14283 return NULL;
14284}
Victor Stinner331ea922010-08-10 16:37:20 +000014285
Victor Stinner71133ff2010-09-01 23:43:53 +000014286Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014287PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014288{
Victor Stinner577db2c2011-10-11 22:12:48 +020014289 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014290 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014292 if (!PyUnicode_Check(unicode)) {
14293 PyErr_BadArgument();
14294 return NULL;
14295 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014296 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014297 if (u == NULL)
14298 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014299 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014300 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014301 PyErr_NoMemory();
14302 return NULL;
14303 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014304 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014305 size *= sizeof(Py_UNICODE);
14306 copy = PyMem_Malloc(size);
14307 if (copy == NULL) {
14308 PyErr_NoMemory();
14309 return NULL;
14310 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014311 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014312 return copy;
14313}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014314
Georg Brandl66c221e2010-10-14 07:04:07 +000014315/* A _string module, to export formatter_parser and formatter_field_name_split
14316 to the string.Formatter class implemented in Python. */
14317
14318static PyMethodDef _string_methods[] = {
14319 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14320 METH_O, PyDoc_STR("split the argument as a field name")},
14321 {"formatter_parser", (PyCFunction) formatter_parser,
14322 METH_O, PyDoc_STR("parse the argument as a format string")},
14323 {NULL, NULL}
14324};
14325
14326static struct PyModuleDef _string_module = {
14327 PyModuleDef_HEAD_INIT,
14328 "_string",
14329 PyDoc_STR("string helper module"),
14330 0,
14331 _string_methods,
14332 NULL,
14333 NULL,
14334 NULL,
14335 NULL
14336};
14337
14338PyMODINIT_FUNC
14339PyInit__string(void)
14340{
14341 return PyModule_Create(&_string_module);
14342}
14343
14344
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014345#ifdef __cplusplus
14346}
14347#endif