blob: e7155677600cfd6c2fc285a9b0502a5d0e43ab35 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Guido van Rossumd57fd912000-03-10 22:53:23 +000049/* Endianness switches; defaults to little endian */
50
51#ifdef WORDS_BIGENDIAN
52# define BYTEORDER_IS_BIG_ENDIAN
53#else
54# define BYTEORDER_IS_LITTLE_ENDIAN
55#endif
56
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000057/* --- Globals ------------------------------------------------------------
58
59 The globals are initialized by the _PyUnicode_Init() API and should
60 not be used before calling that API.
61
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Object check used by the accessor macros of this file: a full structural
   consistency check in debug builds, a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020074
/* Field accessors.  The underscore-prefixed forms are raw member accesses
   (usable as assignment targets); the checked forms assert on the object
   first. */

/* Raw 'utf8' member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                                     \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 pointer of a ready string: for compact ASCII objects this is the
   memory located right after the PyASCIIObject header, otherwise the
   'utf8' member. */
#define PyUnicode_UTF8(op)                                      \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     PyUnicode_IS_COMPACT_ASCII(op) ?                           \
         ((char*)((PyASCIIObject*)(op) + 1)) :                  \
         _PyUnicode_UTF8(op))

/* Raw 'utf8_length' member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                              \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the 'length' member for compact ASCII
   objects, the 'utf8_length' member otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(PyUnicode_IS_READY(op)),                            \
     PyUnicode_IS_COMPACT_ASCII(op) ?                           \
         ((PyASCIIObject*)(op))->length :                       \
         _PyUnicode_UTF8_LENGTH(op))

/* Raw wchar_t representation and its length. */
#define _PyUnicode_WSTR(op)                                     \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                              \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Raw members of the common PyASCIIObject header. */
#define _PyUnicode_LENGTH(op)                                   \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                                    \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                                     \
    (((PyASCIIObject *)(op))->hash)

/* Checked read of the 'state.kind' and 'length' members. */
#define _PyUnicode_KIND(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     ((PyASCIIObject *)(op))->length)

/* Raw 'data.any' member of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                                 \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200109
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   object is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ?                                  \
          0 :                                                   \
          _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200116
/* True if the UTF-8 pointer of the object is the same pointer as its
   character data.  Must not be used on compact ASCII objects. */
#define _PyUnicode_SHARE_UTF8(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),                   \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wchar_t pointer of the object is the same pointer as its
   character data.  The expansion only refers to the macro argument; it
   does not depend on any particular variable name at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                               \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
124
/* True if the object owns a separate UTF-8 buffer: it is not compact
   ASCII, its 'utf8' pointer is set, and that pointer is not the character
   data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (!PyUnicode_IS_COMPACT_ASCII(op)                           \
      && _PyUnicode_UTF8(op)                                    \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the object owns a separate wchar_t buffer: its 'wstr' pointer
   is set and either the object is not ready yet or that pointer is not
   the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (assert(_PyUnicode_CHECK(op)),                              \
     (_PyUnicode_WSTR(op) &&                                    \
      (!PyUnicode_IS_READY(op) ||                               \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
140
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *".
   to points to the buffer the converted characters are written to; it is
   evaluated as a whole expression and then converted to "to_type *", so
   an argument such as "buf + offset" keeps its own pointer arithmetic.

   Each element is converted with a plain cast.  The bulk of the input is
   copied four elements per iteration; the remaining 0-3 elements are
   handled by the trailing loop. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
/* To be used after the content of a string was modified: resets the hash
   member to -1. */
#define _PyUnicode_DIRTY(op)                            \
    do {                                                \
        _PyUnicode_HASH(op) = -1;                       \
    } while (0)
167
Walter Dörwald16807132007-05-25 13:52:07 +0000168/* This dictionary holds all interned unicode strings. Note that references
169 to strings in this dictionary are *not* counted in the string's ob_refcnt.
170 When the interned string reaches a refcnt of 0 the string deallocation
171 function will delete the reference from this dictionary.
172
173 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000174 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000175*/
176static PyObject *interned;
177
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000178/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200179static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000180
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200181/* List of static strings. */
182static _Py_Identifier *static_strings;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* Single character Unicode strings in the Latin-1 range are being
185 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200186static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187
/* Fast detection of the most frequent whitespace characters: one entry
   per 7-bit code, 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
218
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200219/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200220static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200221static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200222static void copy_characters(
223 PyObject *to, Py_ssize_t to_start,
224 PyObject *from, Py_ssize_t from_start,
225 Py_ssize_t how_many);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226
Alexander Belopolsky40018472011-02-26 01:02:56 +0000227static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200228unicode_fromascii(const unsigned char *s, Py_ssize_t size);
229static PyObject *
230_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
231static PyObject *
232_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
235
236static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000237unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000238 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100239 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
241
Alexander Belopolsky40018472011-02-26 01:02:56 +0000242static void
243raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300244 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100245 PyObject *unicode,
246 Py_ssize_t startpos, Py_ssize_t endpos,
247 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000248
/* Fast detection of line breaks among the 7-bit codes: one entry per code,
   1 for a line break and 0 otherwise.  The table is only ever read, hence
   const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
276
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300277/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
278 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000279Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000280PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000282#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000283 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 /* This is actually an illegal character, so it should
286 not be passed to unichr. */
287 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000288#endif
289}
290
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal layout of a unicode object.

   op must be a unicode object.  The function verifies that the state
   flags (kind, compact, ascii, ready) agree with each other and with the
   data, utf8 and wstr pointers and lengths.  If check_content is non-zero
   and the string is not a wchar_t-only string, the characters are scanned
   as well to verify that the narrowest possible kind is used.

   Every violation trips an assert(); a character above U+10FFFF dumps the
   whole string and calls abort().  The function always returns 1, so it
   can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header exists */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wchar_t buffer exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr is the character data exactly when the kind has the
               width of wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (maxchar > 0x10FFFF) {
            printf("Invalid Unicode string! {");
            for (i=0; i < ascii->length; i++)
            {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                /* %x expects an unsigned int */
                if (i)
                    printf(", U+%04x", (unsigned int)ch);
                else
                    printf("U+%04x", (unsigned int)ch);
            }
            /* the length is a Py_ssize_t: use the matching length modifier */
            printf("} (len=%" PY_FORMAT_SIZE_T "d)\n", ascii->length);
            /* abort() is not required to flush open streams */
            fflush(stdout);
            abort();
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= 0x10FFFF);
        }
    }
    return 1;
}
#endif
415
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100416static PyObject*
417unicode_result_wchar(PyObject *unicode)
418{
419#ifndef Py_DEBUG
420 Py_ssize_t len;
421
422 assert(Py_REFCNT(unicode) == 1);
423
424 len = _PyUnicode_WSTR_LENGTH(unicode);
425 if (len == 0) {
426 Py_INCREF(unicode_empty);
427 Py_DECREF(unicode);
428 return unicode_empty;
429 }
430
431 if (len == 1) {
432 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
433 if (ch < 256) {
434 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
435 Py_DECREF(unicode);
436 return latin1_char;
437 }
438 }
439
440 if (_PyUnicode_Ready(unicode) < 0) {
441 Py_XDECREF(unicode);
442 return NULL;
443 }
444#else
445 /* don't make the result ready in debug mode to ensure that the caller
446 makes the string ready before using it */
447 assert(_PyUnicode_CheckConsistency(unicode, 1));
448#endif
449 return unicode;
450}
451
452static PyObject*
453unicode_result_ready(PyObject *unicode)
454{
455 Py_ssize_t length;
456
457 length = PyUnicode_GET_LENGTH(unicode);
458 if (length == 0) {
459 if (unicode != unicode_empty) {
460 Py_INCREF(unicode_empty);
461 Py_DECREF(unicode);
462 }
463 return unicode_empty;
464 }
465
466 if (length == 1) {
467 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
468 if (ch < 256) {
469 PyObject *latin1_char = unicode_latin1[ch];
470 if (latin1_char != NULL) {
471 if (unicode != latin1_char) {
472 Py_INCREF(latin1_char);
473 Py_DECREF(unicode);
474 }
475 return latin1_char;
476 }
477 else {
478 assert(_PyUnicode_CheckConsistency(unicode, 1));
479 Py_INCREF(unicode);
480 unicode_latin1[ch] = unicode;
481 return unicode;
482 }
483 }
484 }
485
486 assert(_PyUnicode_CheckConsistency(unicode, 1));
487 return unicode;
488}
489
490static PyObject*
491unicode_result(PyObject *unicode)
492{
493 assert(_PyUnicode_CHECK(unicode));
494 if (PyUnicode_IS_READY(unicode))
495 return unicode_result_ready(unicode);
496 else
497 return unicode_result_wchar(unicode);
498}
499
#ifdef HAVE_MBCS
/* Windows version information; only present when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
503
Thomas Wouters477c8d52006-05-27 19:21:47 +0000504/* --- Bloom Filters ----------------------------------------------------- */
505
506/* stuff to implement simple "bloom filters" for Unicode characters.
507 to keep things simple, we use a single bitmask, using the least 5
508 bits from each unicode characters as the bit index. */
509
510/* the linebreak mask is set up by Unicode_Init below */
511
Antoine Pitrouf068f942010-01-13 14:19:12 +0000512#if LONG_BIT >= 128
513#define BLOOM_WIDTH 128
514#elif LONG_BIT >= 64
515#define BLOOM_WIDTH 64
516#elif LONG_BIT >= 32
517#define BLOOM_WIDTH 32
518#else
519#error "LONG_BIT is smaller than 32"
520#endif
521
/* Type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask consulted by BLOOM_LINEBREAK() for characters >= 128. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low bits of ch
   (ch modulo BLOOM_WIDTH).  mask may be any expression; for BLOOM_ADD it
   must be an lvalue.  A zero result of BLOOM means ch was never added; a
   non-zero result may also be caused by another character that selects
   the same bit. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: table lookup for characters below 128; above that the
   bloom mask is checked first and Py_UNICODE_ISLINEBREAK() is only
   evaluated for characters whose bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000532
Alexander Belopolsky40018472011-02-26 01:02:56 +0000533Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200534make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535{
536 /* calculate simple bloom-style bitmask for a given unicode string */
537
Antoine Pitrouf068f942010-01-13 14:19:12 +0000538 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539 Py_ssize_t i;
540
541 mask = 0;
542 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200543 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544
545 return mask;
546}
547
/* True if chr occurs in str: the bloom mask is tested first and
   PyUnicode_FindChar() is only evaluated when the bit of chr is set. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200552/* Compilation of templated routines */
553
554#include "stringlib/asciilib.h"
555#include "stringlib/fastsearch.h"
556#include "stringlib/partition.h"
557#include "stringlib/split.h"
558#include "stringlib/count.h"
559#include "stringlib/find.h"
560#include "stringlib/find_max_char.h"
561#include "stringlib/localeutil.h"
562#include "stringlib/undef.h"
563
564#include "stringlib/ucs1lib.h"
565#include "stringlib/fastsearch.h"
566#include "stringlib/partition.h"
567#include "stringlib/split.h"
568#include "stringlib/count.h"
569#include "stringlib/find.h"
570#include "stringlib/find_max_char.h"
571#include "stringlib/localeutil.h"
572#include "stringlib/undef.h"
573
574#include "stringlib/ucs2lib.h"
575#include "stringlib/fastsearch.h"
576#include "stringlib/partition.h"
577#include "stringlib/split.h"
578#include "stringlib/count.h"
579#include "stringlib/find.h"
580#include "stringlib/find_max_char.h"
581#include "stringlib/localeutil.h"
582#include "stringlib/undef.h"
583
584#include "stringlib/ucs4lib.h"
585#include "stringlib/fastsearch.h"
586#include "stringlib/partition.h"
587#include "stringlib/split.h"
588#include "stringlib/count.h"
589#include "stringlib/find.h"
590#include "stringlib/find_max_char.h"
591#include "stringlib/localeutil.h"
592#include "stringlib/undef.h"
593
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200594#include "stringlib/unicodedefs.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100598#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200599
Guido van Rossumd57fd912000-03-10 22:53:23 +0000600/* --- Unicode Object ----------------------------------------------------- */
601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200602static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200603fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200604
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200605Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
606 Py_ssize_t size, Py_UCS4 ch,
607 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200608{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200609 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
610
611 switch (kind) {
612 case PyUnicode_1BYTE_KIND:
613 {
614 Py_UCS1 ch1 = (Py_UCS1) ch;
615 if (ch1 == ch)
616 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
617 else
618 return -1;
619 }
620 case PyUnicode_2BYTE_KIND:
621 {
622 Py_UCS2 ch2 = (Py_UCS2) ch;
623 if (ch2 == ch)
624 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
625 else
626 return -1;
627 }
628 case PyUnicode_4BYTE_KIND:
629 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
630 default:
631 assert(0);
632 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200633 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200634}
635
Victor Stinnerfe226c02011-10-03 03:52:20 +0200636static PyObject*
637resize_compact(PyObject *unicode, Py_ssize_t length)
638{
639 Py_ssize_t char_size;
640 Py_ssize_t struct_size;
641 Py_ssize_t new_size;
642 int share_wstr;
643
644 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200645 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200646 if (PyUnicode_IS_COMPACT_ASCII(unicode))
647 struct_size = sizeof(PyASCIIObject);
648 else
649 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200650 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200651
652 _Py_DEC_REFTOTAL;
653 _Py_ForgetReference(unicode);
654
655 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
656 PyErr_NoMemory();
657 return NULL;
658 }
659 new_size = (struct_size + (length + 1) * char_size);
660
661 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
662 if (unicode == NULL) {
663 PyObject_Del(unicode);
664 PyErr_NoMemory();
665 return NULL;
666 }
667 _Py_NewReference(unicode);
668 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200669 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200670 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200671 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
672 _PyUnicode_WSTR_LENGTH(unicode) = length;
673 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
675 length, 0);
676 return unicode;
677}
678
Alexander Belopolsky40018472011-02-26 01:02:56 +0000679static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200680resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000681{
Victor Stinner95663112011-10-04 01:03:50 +0200682 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200683 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200684 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000685
Victor Stinner95663112011-10-04 01:03:50 +0200686 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200687
688 if (PyUnicode_IS_READY(unicode)) {
689 Py_ssize_t char_size;
690 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200691 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 void *data;
693
694 data = _PyUnicode_DATA_ANY(unicode);
695 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200696 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200697 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
698 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200699 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
700 {
701 PyObject_DEL(_PyUnicode_UTF8(unicode));
702 _PyUnicode_UTF8(unicode) = NULL;
703 _PyUnicode_UTF8_LENGTH(unicode) = 0;
704 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200705
706 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
707 PyErr_NoMemory();
708 return -1;
709 }
710 new_size = (length + 1) * char_size;
711
712 data = (PyObject *)PyObject_REALLOC(data, new_size);
713 if (data == NULL) {
714 PyErr_NoMemory();
715 return -1;
716 }
717 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200718 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200720 _PyUnicode_WSTR_LENGTH(unicode) = length;
721 }
722 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200723 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200724 _PyUnicode_UTF8_LENGTH(unicode) = length;
725 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200726 _PyUnicode_LENGTH(unicode) = length;
727 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200728 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200729 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200730 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200731 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732 }
Victor Stinner95663112011-10-04 01:03:50 +0200733 assert(_PyUnicode_WSTR(unicode) != NULL);
734
735 /* check for integer overflow */
736 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
737 PyErr_NoMemory();
738 return -1;
739 }
740 wstr = _PyUnicode_WSTR(unicode);
741 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
742 if (!wstr) {
743 PyErr_NoMemory();
744 return -1;
745 }
746 _PyUnicode_WSTR(unicode) = wstr;
747 _PyUnicode_WSTR(unicode)[length] = 0;
748 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200749 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000750 return 0;
751}
752
Victor Stinnerfe226c02011-10-03 03:52:20 +0200753static PyObject*
754resize_copy(PyObject *unicode, Py_ssize_t length)
755{
756 Py_ssize_t copy_length;
757 if (PyUnicode_IS_COMPACT(unicode)) {
758 PyObject *copy;
759 assert(PyUnicode_IS_READY(unicode));
760
761 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
762 if (copy == NULL)
763 return NULL;
764
765 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200766 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200767 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200768 }
769 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200770 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771 assert(_PyUnicode_WSTR(unicode) != NULL);
772 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200773 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774 if (w == NULL)
775 return NULL;
776 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
777 copy_length = Py_MIN(copy_length, length);
778 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
779 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200780 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200781 }
782}
783
/* We allocate one more character than requested to make sure the string
   is U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200793#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200794static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795#endif
796
Alexander Belopolsky40018472011-02-26 01:02:56 +0000797static PyUnicodeObject *
798_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000799{
800 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200801 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000802
Thomas Wouters477c8d52006-05-27 19:21:47 +0000803 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000804 if (length == 0 && unicode_empty != NULL) {
805 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200806 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000807 }
808
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000809 /* Ensure we won't overflow the size. */
810 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
811 return (PyUnicodeObject *)PyErr_NoMemory();
812 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200813 if (length < 0) {
814 PyErr_SetString(PyExc_SystemError,
815 "Negative size passed to _PyUnicode_New");
816 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000817 }
818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200819#ifdef Py_DEBUG
820 ++unicode_old_new_calls;
821#endif
822
823 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
824 if (unicode == NULL)
825 return NULL;
826 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
827 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
828 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000829 PyErr_NoMemory();
830 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200832
Jeremy Hyltond8082792003-09-16 19:41:39 +0000833 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000834 * the caller fails before initializing str -- unicode_resize()
835 * reads str[0], and the Keep-Alive optimization can keep memory
836 * allocated for str alive across a call to unicode_dealloc(unicode).
837 * We don't want unicode_resize to read uninitialized memory in
838 * that case.
839 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200840 _PyUnicode_WSTR(unicode)[0] = 0;
841 _PyUnicode_WSTR(unicode)[length] = 0;
842 _PyUnicode_WSTR_LENGTH(unicode) = length;
843 _PyUnicode_HASH(unicode) = -1;
844 _PyUnicode_STATE(unicode).interned = 0;
845 _PyUnicode_STATE(unicode).kind = 0;
846 _PyUnicode_STATE(unicode).compact = 0;
847 _PyUnicode_STATE(unicode).ready = 0;
848 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200849 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200850 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200851 _PyUnicode_UTF8(unicode) = NULL;
852 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100853 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000854 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000855
Benjamin Peterson29060642009-01-31 22:14:21 +0000856 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000857 /* XXX UNREF/NEWREF interface should be more symmetrical */
858 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000859 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000860 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000861 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862}
863
Victor Stinnerf42dc442011-10-02 23:33:16 +0200864static const char*
865unicode_kind_name(PyObject *unicode)
866{
Victor Stinner42dfd712011-10-03 14:41:45 +0200867 /* don't check consistency: unicode_kind_name() is called from
868 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200869 if (!PyUnicode_IS_COMPACT(unicode))
870 {
871 if (!PyUnicode_IS_READY(unicode))
872 return "wstr";
873 switch(PyUnicode_KIND(unicode))
874 {
875 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200876 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200877 return "legacy ascii";
878 else
879 return "legacy latin1";
880 case PyUnicode_2BYTE_KIND:
881 return "legacy UCS2";
882 case PyUnicode_4BYTE_KIND:
883 return "legacy UCS4";
884 default:
885 return "<legacy invalid kind>";
886 }
887 }
888 assert(PyUnicode_IS_READY(unicode));
889 switch(PyUnicode_KIND(unicode))
890 {
891 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200892 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200893 return "ascii";
894 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200895 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200896 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200897 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200898 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200899 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200900 default:
901 return "<invalid compact kind>";
902 }
903}
904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200906static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200907
908/* Functions wrapping macros for use in debugger */
909char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200910 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200911}
912
913void *_PyUnicode_compact_data(void *unicode) {
914 return _PyUnicode_COMPACT_DATA(unicode);
915}
916void *_PyUnicode_data(void *unicode){
917 printf("obj %p\n", unicode);
918 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
919 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
920 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
921 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
922 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
923 return PyUnicode_DATA(unicode);
924}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200925
926void
927_PyUnicode_Dump(PyObject *op)
928{
929 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200930 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
931 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
932 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200933
Victor Stinnera849a4b2011-10-03 12:12:11 +0200934 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200935 {
936 if (ascii->state.ascii)
937 data = (ascii + 1);
938 else
939 data = (compact + 1);
940 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200941 else
942 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200943 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
944
Victor Stinnera849a4b2011-10-03 12:12:11 +0200945 if (ascii->wstr == data)
946 printf("shared ");
947 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200948
Victor Stinnera3b334d2011-10-03 13:53:37 +0200949 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200950 printf(" (%zu), ", compact->wstr_length);
951 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
952 printf("shared ");
953 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200954 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200955 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200956}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200957#endif
958
959PyObject *
960PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
961{
962 PyObject *obj;
963 PyCompactUnicodeObject *unicode;
964 void *data;
965 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200966 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967 Py_ssize_t char_size;
968 Py_ssize_t struct_size;
969
970 /* Optimization for empty strings */
971 if (size == 0 && unicode_empty != NULL) {
972 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200973 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974 }
975
976#ifdef Py_DEBUG
977 ++unicode_new_new_calls;
978#endif
979
Victor Stinner9e9d6892011-10-04 01:02:02 +0200980 is_ascii = 0;
981 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982 struct_size = sizeof(PyCompactUnicodeObject);
983 if (maxchar < 128) {
984 kind_state = PyUnicode_1BYTE_KIND;
985 char_size = 1;
986 is_ascii = 1;
987 struct_size = sizeof(PyASCIIObject);
988 }
989 else if (maxchar < 256) {
990 kind_state = PyUnicode_1BYTE_KIND;
991 char_size = 1;
992 }
993 else if (maxchar < 65536) {
994 kind_state = PyUnicode_2BYTE_KIND;
995 char_size = 2;
996 if (sizeof(wchar_t) == 2)
997 is_sharing = 1;
998 }
999 else {
1000 kind_state = PyUnicode_4BYTE_KIND;
1001 char_size = 4;
1002 if (sizeof(wchar_t) == 4)
1003 is_sharing = 1;
1004 }
1005
1006 /* Ensure we won't overflow the size. */
1007 if (size < 0) {
1008 PyErr_SetString(PyExc_SystemError,
1009 "Negative size passed to PyUnicode_New");
1010 return NULL;
1011 }
1012 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1013 return PyErr_NoMemory();
1014
1015 /* Duplicated allocation code from _PyObject_New() instead of a call to
1016 * PyObject_New() so we are able to allocate space for the object and
1017 * it's data buffer.
1018 */
1019 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1020 if (obj == NULL)
1021 return PyErr_NoMemory();
1022 obj = PyObject_INIT(obj, &PyUnicode_Type);
1023 if (obj == NULL)
1024 return NULL;
1025
1026 unicode = (PyCompactUnicodeObject *)obj;
1027 if (is_ascii)
1028 data = ((PyASCIIObject*)obj) + 1;
1029 else
1030 data = unicode + 1;
1031 _PyUnicode_LENGTH(unicode) = size;
1032 _PyUnicode_HASH(unicode) = -1;
1033 _PyUnicode_STATE(unicode).interned = 0;
1034 _PyUnicode_STATE(unicode).kind = kind_state;
1035 _PyUnicode_STATE(unicode).compact = 1;
1036 _PyUnicode_STATE(unicode).ready = 1;
1037 _PyUnicode_STATE(unicode).ascii = is_ascii;
1038 if (is_ascii) {
1039 ((char*)data)[size] = 0;
1040 _PyUnicode_WSTR(unicode) = NULL;
1041 }
1042 else if (kind_state == PyUnicode_1BYTE_KIND) {
1043 ((char*)data)[size] = 0;
1044 _PyUnicode_WSTR(unicode) = NULL;
1045 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001046 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001047 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048 }
1049 else {
1050 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001051 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 if (kind_state == PyUnicode_2BYTE_KIND)
1053 ((Py_UCS2*)data)[size] = 0;
1054 else /* kind_state == PyUnicode_4BYTE_KIND */
1055 ((Py_UCS4*)data)[size] = 0;
1056 if (is_sharing) {
1057 _PyUnicode_WSTR_LENGTH(unicode) = size;
1058 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1059 }
1060 else {
1061 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1062 _PyUnicode_WSTR(unicode) = NULL;
1063 }
1064 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001065 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001066 return obj;
1067}
1068
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.  A
   high surrogate immediately followed by a low surrogate is combined into a
   single code point; every other code unit (lone surrogates included) is
   copied unchanged.  The other conversions are implemented as macros for
   efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src+1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (is_pair) {
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1107
Victor Stinnercd9950f2011-10-02 00:34:53 +02001108static int
1109_PyUnicode_Dirty(PyObject *unicode)
1110{
Victor Stinner910337b2011-10-03 03:20:16 +02001111 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001112 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001113 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001114 "Cannot modify a string having more than 1 reference");
1115 return -1;
1116 }
1117 _PyUnicode_DIRTY(unicode);
1118 return 0;
1119}
1120
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001121static int
1122_copy_characters(PyObject *to, Py_ssize_t to_start,
1123 PyObject *from, Py_ssize_t from_start,
1124 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 unsigned int from_kind, to_kind;
1127 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001128 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 assert(PyUnicode_Check(from));
1131 assert(PyUnicode_Check(to));
1132 assert(PyUnicode_IS_READY(from));
1133 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001134
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1136 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1137 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001139 if (how_many == 0)
1140 return 0;
1141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001143 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001144 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001145 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147#ifdef Py_DEBUG
1148 if (!check_maxchar
1149 && (from_kind > to_kind
1150 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001151 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1153 Py_UCS4 ch;
1154 Py_ssize_t i;
1155 for (i=0; i < how_many; i++) {
1156 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1157 assert(ch <= to_maxchar);
1158 }
1159 }
1160#endif
1161 fast = (from_kind == to_kind);
1162 if (check_maxchar
1163 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1164 {
1165 /* deny latin1 => ascii */
1166 fast = 0;
1167 }
1168
1169 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001170 Py_MEMCPY((char*)to_data + to_kind * to_start,
1171 (char*)from_data + from_kind * from_start,
1172 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001174 else if (from_kind == PyUnicode_1BYTE_KIND
1175 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001176 {
1177 _PyUnicode_CONVERT_BYTES(
1178 Py_UCS1, Py_UCS2,
1179 PyUnicode_1BYTE_DATA(from) + from_start,
1180 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1181 PyUnicode_2BYTE_DATA(to) + to_start
1182 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001183 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001184 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001185 && to_kind == PyUnicode_4BYTE_KIND)
1186 {
1187 _PyUnicode_CONVERT_BYTES(
1188 Py_UCS1, Py_UCS4,
1189 PyUnicode_1BYTE_DATA(from) + from_start,
1190 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1191 PyUnicode_4BYTE_DATA(to) + to_start
1192 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001193 }
1194 else if (from_kind == PyUnicode_2BYTE_KIND
1195 && to_kind == PyUnicode_4BYTE_KIND)
1196 {
1197 _PyUnicode_CONVERT_BYTES(
1198 Py_UCS2, Py_UCS4,
1199 PyUnicode_2BYTE_DATA(from) + from_start,
1200 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1201 PyUnicode_4BYTE_DATA(to) + to_start
1202 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001203 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001204 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001205 /* check if max_char(from substring) <= max_char(to) */
1206 if (from_kind > to_kind
1207 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001208 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001209 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001210 /* slow path to check for character overflow */
1211 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001212 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 Py_ssize_t i;
1214
Victor Stinner56c161a2011-10-06 02:47:11 +02001215#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001216 for (i=0; i < how_many; i++) {
1217 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001218 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001219 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1220 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001221#else
1222 if (!check_maxchar) {
1223 for (i=0; i < how_many; i++) {
1224 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1225 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1226 }
1227 }
1228 else {
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 if (ch > to_maxchar)
1232 return 1;
1233 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1234 }
1235 }
1236#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001238 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001239 assert(0 && "inconsistent state");
1240 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001241 }
1242 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001243 return 0;
1244}
1245
1246static void
1247copy_characters(PyObject *to, Py_ssize_t to_start,
1248 PyObject *from, Py_ssize_t from_start,
1249 Py_ssize_t how_many)
1250{
1251 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1252}
1253
1254Py_ssize_t
1255PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1256 PyObject *from, Py_ssize_t from_start,
1257 Py_ssize_t how_many)
1258{
1259 int err;
1260
1261 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1262 PyErr_BadInternalCall();
1263 return -1;
1264 }
1265
1266 if (PyUnicode_READY(from))
1267 return -1;
1268 if (PyUnicode_READY(to))
1269 return -1;
1270
1271 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1272 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1273 PyErr_Format(PyExc_SystemError,
1274 "Cannot write %zi characters at %zi "
1275 "in a string of %zi characters",
1276 how_many, to_start, PyUnicode_GET_LENGTH(to));
1277 return -1;
1278 }
1279
1280 if (how_many == 0)
1281 return 0;
1282
1283 if (_PyUnicode_Dirty(to))
1284 return -1;
1285
1286 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1287 if (err) {
1288 PyErr_Format(PyExc_SystemError,
1289 "Cannot copy %s characters "
1290 "into a string of %s characters",
1291 unicode_kind_name(from),
1292 unicode_kind_name(to));
1293 return -1;
1294 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001295 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296}
1297
Victor Stinner17222162011-09-28 22:15:37 +02001298/* Find the maximum code point and count the number of surrogate pairs so a
1299 correct string length can be computed before converting a string to UCS4.
1300 This function counts single surrogates as a character and not as a pair.
1301
1302 Return 0 on success, or -1 on error. */
1303static int
1304find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1305 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306{
1307 const wchar_t *iter;
1308
Victor Stinnerc53be962011-10-02 21:33:54 +02001309 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 *num_surrogates = 0;
1311 *maxchar = 0;
1312
1313 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001314 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001316#if SIZEOF_WCHAR_T != 2
1317 if (*maxchar >= 0x10000)
1318 return 0;
1319#endif
1320 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321#if SIZEOF_WCHAR_T == 2
1322 if (*iter >= 0xD800 && *iter <= 0xDBFF
1323 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1324 {
1325 Py_UCS4 surrogate_val;
1326 surrogate_val = (((iter[0] & 0x3FF)<<10)
1327 | (iter[1] & 0x3FF)) + 0x10000;
1328 ++(*num_surrogates);
1329 if (surrogate_val > *maxchar)
1330 *maxchar = surrogate_val;
1331 iter += 2;
1332 }
1333 else
1334 iter++;
1335#else
1336 iter++;
1337#endif
1338 }
1339 return 0;
1340}
1341
1342#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001343static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344#endif
1345
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001346int
1347_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001348{
1349 wchar_t *end;
1350 Py_UCS4 maxchar = 0;
1351 Py_ssize_t num_surrogates;
1352#if SIZEOF_WCHAR_T == 2
1353 Py_ssize_t length_wo_surrogates;
1354#endif
1355
Georg Brandl7597add2011-10-05 16:36:47 +02001356 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001357 strings were created using _PyObject_New() and where no canonical
1358 representation (the str field) has been set yet aka strings
1359 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001360 assert(_PyUnicode_CHECK(unicode));
1361 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001362 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001363 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001364 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001365 /* Actually, it should neither be interned nor be anything else: */
1366 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367
1368#ifdef Py_DEBUG
1369 ++unicode_ready_calls;
1370#endif
1371
1372 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001373 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001374 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376
1377 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001378 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1379 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380 PyErr_NoMemory();
1381 return -1;
1382 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001383 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384 _PyUnicode_WSTR(unicode), end,
1385 PyUnicode_1BYTE_DATA(unicode));
1386 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1387 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1388 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1389 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001390 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001391 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001392 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 }
1394 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001395 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001396 _PyUnicode_UTF8(unicode) = NULL;
1397 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398 }
1399 PyObject_FREE(_PyUnicode_WSTR(unicode));
1400 _PyUnicode_WSTR(unicode) = NULL;
1401 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1402 }
1403 /* In this case we might have to convert down from 4-byte native
1404 wchar_t to 2-byte unicode. */
1405 else if (maxchar < 65536) {
1406 assert(num_surrogates == 0 &&
1407 "FindMaxCharAndNumSurrogatePairs() messed up");
1408
Victor Stinner506f5922011-09-28 22:34:18 +02001409#if SIZEOF_WCHAR_T == 2
1410 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001411 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001412 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1413 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1414 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001415 _PyUnicode_UTF8(unicode) = NULL;
1416 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001417#else
1418 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001419 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001420 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001421 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001422 PyErr_NoMemory();
1423 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 }
Victor Stinner506f5922011-09-28 22:34:18 +02001425 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1426 _PyUnicode_WSTR(unicode), end,
1427 PyUnicode_2BYTE_DATA(unicode));
1428 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1429 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1430 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001431 _PyUnicode_UTF8(unicode) = NULL;
1432 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001433 PyObject_FREE(_PyUnicode_WSTR(unicode));
1434 _PyUnicode_WSTR(unicode) = NULL;
1435 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1436#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 }
1438 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1439 else {
1440#if SIZEOF_WCHAR_T == 2
1441 /* in case the native representation is 2-bytes, we need to allocate a
1442 new normalized 4-byte version. */
1443 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001444 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1445 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446 PyErr_NoMemory();
1447 return -1;
1448 }
1449 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1450 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001451 _PyUnicode_UTF8(unicode) = NULL;
1452 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001453 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1454 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001455 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001456 PyObject_FREE(_PyUnicode_WSTR(unicode));
1457 _PyUnicode_WSTR(unicode) = NULL;
1458 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1459#else
1460 assert(num_surrogates == 0);
1461
Victor Stinnerc3c74152011-10-02 20:39:55 +02001462 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001464 _PyUnicode_UTF8(unicode) = NULL;
1465 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1467#endif
1468 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1469 }
1470 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001471 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 return 0;
1473}
1474
Alexander Belopolsky40018472011-02-26 01:02:56 +00001475static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001476unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477{
Walter Dörwald16807132007-05-25 13:52:07 +00001478 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001479 case SSTATE_NOT_INTERNED:
1480 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001481
Benjamin Peterson29060642009-01-31 22:14:21 +00001482 case SSTATE_INTERNED_MORTAL:
1483 /* revive dead object temporarily for DelItem */
1484 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001485 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001486 Py_FatalError(
1487 "deletion of interned string failed");
1488 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001489
Benjamin Peterson29060642009-01-31 22:14:21 +00001490 case SSTATE_INTERNED_IMMORTAL:
1491 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001492
Benjamin Peterson29060642009-01-31 22:14:21 +00001493 default:
1494 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001495 }
1496
Victor Stinner03490912011-10-03 23:45:12 +02001497 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001498 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001499 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001500 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001501
1502 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001503 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504 }
1505 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001506 if (_PyUnicode_DATA_ANY(unicode))
1507 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001508 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509 }
1510}
1511
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or a one-character string cached in unicode_latin1[]),
   and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    ascii = (PyASCIIObject *)unicode;
    /* Only ready (non-wchar-kind) strings of length 1 can be in the
       Latin-1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1528
Alexander Belopolsky40018472011-02-26 01:02:56 +00001529static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001530unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001531{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001532 if (Py_REFCNT(unicode) != 1)
1533 return 0;
1534 if (PyUnicode_CHECK_INTERNED(unicode))
1535 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001536#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001537 /* singleton refcount is greater than 1 */
1538 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001539#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001540 return 1;
1541}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001542
Victor Stinnerfe226c02011-10-03 03:52:20 +02001543static int
1544unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1545{
1546 PyObject *unicode;
1547 Py_ssize_t old_length;
1548
1549 assert(p_unicode != NULL);
1550 unicode = *p_unicode;
1551
1552 assert(unicode != NULL);
1553 assert(PyUnicode_Check(unicode));
1554 assert(0 <= length);
1555
Victor Stinner910337b2011-10-03 03:20:16 +02001556 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001557 old_length = PyUnicode_WSTR_LENGTH(unicode);
1558 else
1559 old_length = PyUnicode_GET_LENGTH(unicode);
1560 if (old_length == length)
1561 return 0;
1562
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001563 if (length == 0) {
1564 Py_DECREF(*p_unicode);
1565 *p_unicode = unicode_empty;
1566 Py_INCREF(*p_unicode);
1567 return 0;
1568 }
1569
Victor Stinnerfe226c02011-10-03 03:52:20 +02001570 if (!unicode_resizable(unicode)) {
1571 PyObject *copy = resize_copy(unicode, length);
1572 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001573 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001574 Py_DECREF(*p_unicode);
1575 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001576 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001577 }
1578
Victor Stinnerfe226c02011-10-03 03:52:20 +02001579 if (PyUnicode_IS_COMPACT(unicode)) {
1580 *p_unicode = resize_compact(unicode, length);
1581 if (*p_unicode == NULL)
1582 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001583 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001585 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001586 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001587}
1588
Alexander Belopolsky40018472011-02-26 01:02:56 +00001589int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001590PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001591{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592 PyObject *unicode;
1593 if (p_unicode == NULL) {
1594 PyErr_BadInternalCall();
1595 return -1;
1596 }
1597 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001598 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001599 {
1600 PyErr_BadInternalCall();
1601 return -1;
1602 }
1603 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001604}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001605
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001606static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001607unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001608{
1609 PyObject *result;
1610 assert(PyUnicode_IS_READY(*p_unicode));
1611 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1612 return 0;
1613 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1614 maxchar);
1615 if (result == NULL)
1616 return -1;
1617 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1618 PyUnicode_GET_LENGTH(*p_unicode));
1619 Py_DECREF(*p_unicode);
1620 *p_unicode = result;
1621 return 0;
1622}
1623
1624static int
1625unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1626 Py_UCS4 ch)
1627{
1628 if (unicode_widen(p_unicode, ch) < 0)
1629 return -1;
1630 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1631 PyUnicode_DATA(*p_unicode),
1632 (*pos)++, ch);
1633 return 0;
1634}
1635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636static PyObject*
1637get_latin1_char(unsigned char ch)
1638{
Victor Stinnera464fc12011-10-02 20:39:30 +02001639 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001640 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001641 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 if (!unicode)
1643 return NULL;
1644 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001645 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646 unicode_latin1[ch] = unicode;
1647 }
1648 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001649 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001650}
1651
Alexander Belopolsky40018472011-02-26 01:02:56 +00001652PyObject *
1653PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001654{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001655 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001656 Py_UCS4 maxchar = 0;
1657 Py_ssize_t num_surrogates;
1658
1659 if (u == NULL)
1660 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001662 /* If the Unicode data is known at construction time, we can apply
1663 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001665 /* Optimization for empty strings */
1666 if (size == 0 && unicode_empty != NULL) {
1667 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001668 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001669 }
Tim Petersced69f82003-09-16 20:30:58 +00001670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671 /* Single character Unicode objects in the Latin-1 range are
1672 shared when using this constructor */
1673 if (size == 1 && *u < 256)
1674 return get_latin1_char((unsigned char)*u);
1675
1676 /* If not empty and not single character, copy the Unicode data
1677 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001678 if (find_maxchar_surrogates(u, u + size,
1679 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001680 return NULL;
1681
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001682 unicode = PyUnicode_New(size - num_surrogates,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 if (!unicode)
1685 return NULL;
1686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001687 switch (PyUnicode_KIND(unicode)) {
1688 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001689 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001690 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1691 break;
1692 case PyUnicode_2BYTE_KIND:
1693#if Py_UNICODE_SIZE == 2
1694 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1695#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001696 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1698#endif
1699 break;
1700 case PyUnicode_4BYTE_KIND:
1701#if SIZEOF_WCHAR_T == 2
1702 /* This is the only case which has to process surrogates, thus
1703 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001704 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001705#else
1706 assert(num_surrogates == 0);
1707 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1708#endif
1709 break;
1710 default:
1711 assert(0 && "Impossible state");
1712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001714 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715}
1716
Alexander Belopolsky40018472011-02-26 01:02:56 +00001717PyObject *
1718PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001719{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001720 if (size < 0) {
1721 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001722 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001723 return NULL;
1724 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001725
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001726 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001727 some optimizations which share commonly used objects.
1728 Also, this means the input must be UTF-8, so fall back to the
1729 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001730 if (u != NULL) {
1731
Benjamin Peterson29060642009-01-31 22:14:21 +00001732 /* Optimization for empty strings */
1733 if (size == 0 && unicode_empty != NULL) {
1734 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001735 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001736 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001737
1738 /* Single characters are shared when using this constructor.
1739 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001740 if (size == 1 && (unsigned char)*u < 128)
1741 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001742
1743 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001744 }
1745
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001746 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001747}
1748
Alexander Belopolsky40018472011-02-26 01:02:56 +00001749PyObject *
1750PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001751{
1752 size_t size = strlen(u);
1753 if (size > PY_SSIZE_T_MAX) {
1754 PyErr_SetString(PyExc_OverflowError, "input too long");
1755 return NULL;
1756 }
1757
1758 return PyUnicode_FromStringAndSize(u, size);
1759}
1760
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001761PyObject *
1762_PyUnicode_FromId(_Py_Identifier *id)
1763{
1764 if (!id->object) {
1765 id->object = PyUnicode_FromString(id->string);
1766 if (!id->object)
1767 return NULL;
1768 PyUnicode_InternInPlace(&id->object);
1769 assert(!id->next);
1770 id->next = static_strings;
1771 static_strings = id;
1772 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001773 return id->object;
1774}
1775
1776void
1777_PyUnicode_ClearStaticStrings()
1778{
1779 _Py_Identifier *i;
1780 for (i = static_strings; i; i = i->next) {
1781 Py_DECREF(i->object);
1782 i->object = NULL;
1783 i->next = NULL;
1784 }
1785}
1786
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001787/* Internal function, don't check maximum character */
1788
Victor Stinnere57b1c02011-09-28 22:20:48 +02001789static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001790unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001791{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001792 PyObject *res;
1793#ifdef Py_DEBUG
1794 const unsigned char *p;
1795 const unsigned char *end = s + size;
1796 for (p=s; p < end; p++) {
1797 assert(*p < 128);
1798 }
1799#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001800 if (size == 1)
1801 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001802 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001803 if (!res)
1804 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001805 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001806 return res;
1807}
1808
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001809static Py_UCS4
1810kind_maxchar_limit(unsigned int kind)
1811{
1812 switch(kind) {
1813 case PyUnicode_1BYTE_KIND:
1814 return 0x80;
1815 case PyUnicode_2BYTE_KIND:
1816 return 0x100;
1817 case PyUnicode_4BYTE_KIND:
1818 return 0x10000;
1819 default:
1820 assert(0 && "invalid kind");
1821 return 0x10ffff;
1822 }
1823}
1824
Victor Stinner702c7342011-10-05 13:50:52 +02001825static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001826_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001827{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001828 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001829 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001830
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001831 if (size == 0) {
1832 Py_INCREF(unicode_empty);
1833 return unicode_empty;
1834 }
1835 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001836 if (size == 1)
1837 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001838
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001839 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001840 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841 if (!res)
1842 return NULL;
1843 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001844 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001846}
1847
Victor Stinnere57b1c02011-09-28 22:20:48 +02001848static PyObject*
1849_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001850{
1851 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001852 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001853
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001854 if (size == 0) {
1855 Py_INCREF(unicode_empty);
1856 return unicode_empty;
1857 }
1858 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001859 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001860 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001861
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001862 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001863 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001864 if (!res)
1865 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001866 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001867 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001868 else {
1869 _PyUnicode_CONVERT_BYTES(
1870 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1871 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001872 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873 return res;
1874}
1875
Victor Stinnere57b1c02011-09-28 22:20:48 +02001876static PyObject*
1877_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001878{
1879 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001880 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001881
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001882 if (size == 0) {
1883 Py_INCREF(unicode_empty);
1884 return unicode_empty;
1885 }
1886 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001887 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001888 return get_latin1_char((unsigned char)u[0]);
1889
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001890 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001891 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 if (!res)
1893 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001894 if (max_char < 256)
1895 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1896 PyUnicode_1BYTE_DATA(res));
1897 else if (max_char < 0x10000)
1898 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1899 PyUnicode_2BYTE_DATA(res));
1900 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001902 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 return res;
1904}
1905
1906PyObject*
1907PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1908{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001909 if (size < 0) {
1910 PyErr_SetString(PyExc_ValueError, "size must be positive");
1911 return NULL;
1912 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913 switch(kind) {
1914 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001915 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001916 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001917 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001918 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001919 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001920 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001921 PyErr_SetString(PyExc_SystemError, "invalid kind");
1922 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001924}
1925
Victor Stinner25a4b292011-10-06 12:31:55 +02001926/* Ensure that a string uses the most efficient storage, if it is not the
1927 case: create a new string with of the right kind. Write NULL into *p_unicode
1928 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001929static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001930unicode_adjust_maxchar(PyObject **p_unicode)
1931{
1932 PyObject *unicode, *copy;
1933 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001934 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001935 unsigned int kind;
1936
1937 assert(p_unicode != NULL);
1938 unicode = *p_unicode;
1939 assert(PyUnicode_IS_READY(unicode));
1940 if (PyUnicode_IS_ASCII(unicode))
1941 return;
1942
1943 len = PyUnicode_GET_LENGTH(unicode);
1944 kind = PyUnicode_KIND(unicode);
1945 if (kind == PyUnicode_1BYTE_KIND) {
1946 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001947 max_char = ucs1lib_find_max_char(u, u + len);
1948 if (max_char >= 128)
1949 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001950 }
1951 else if (kind == PyUnicode_2BYTE_KIND) {
1952 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001953 max_char = ucs2lib_find_max_char(u, u + len);
1954 if (max_char >= 256)
1955 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001956 }
1957 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001958 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001959 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001960 max_char = ucs4lib_find_max_char(u, u + len);
1961 if (max_char >= 0x10000)
1962 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001963 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001964 copy = PyUnicode_New(len, max_char);
1965 copy_characters(copy, 0, unicode, 0, len);
1966 Py_DECREF(unicode);
1967 *p_unicode = copy;
1968}
1969
Victor Stinner034f6cf2011-09-30 02:26:44 +02001970PyObject*
1971PyUnicode_Copy(PyObject *unicode)
1972{
Victor Stinner87af4f22011-11-21 23:03:47 +01001973 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001974 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001975
Victor Stinner034f6cf2011-09-30 02:26:44 +02001976 if (!PyUnicode_Check(unicode)) {
1977 PyErr_BadInternalCall();
1978 return NULL;
1979 }
1980 if (PyUnicode_READY(unicode))
1981 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001982
Victor Stinner87af4f22011-11-21 23:03:47 +01001983 length = PyUnicode_GET_LENGTH(unicode);
1984 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001985 if (!copy)
1986 return NULL;
1987 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1988
Victor Stinner87af4f22011-11-21 23:03:47 +01001989 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1990 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001991 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001992 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001993}
1994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001995
Victor Stinnerbc603d12011-10-02 01:00:40 +02001996/* Widen Unicode objects to larger buffers. Don't write terminating null
1997 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998
1999void*
2000_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2001{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002002 Py_ssize_t len;
2003 void *result;
2004 unsigned int skind;
2005
2006 if (PyUnicode_READY(s))
2007 return NULL;
2008
2009 len = PyUnicode_GET_LENGTH(s);
2010 skind = PyUnicode_KIND(s);
2011 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002012 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002013 return NULL;
2014 }
2015 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002016 case PyUnicode_2BYTE_KIND:
2017 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2018 if (!result)
2019 return PyErr_NoMemory();
2020 assert(skind == PyUnicode_1BYTE_KIND);
2021 _PyUnicode_CONVERT_BYTES(
2022 Py_UCS1, Py_UCS2,
2023 PyUnicode_1BYTE_DATA(s),
2024 PyUnicode_1BYTE_DATA(s) + len,
2025 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002027 case PyUnicode_4BYTE_KIND:
2028 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2029 if (!result)
2030 return PyErr_NoMemory();
2031 if (skind == PyUnicode_2BYTE_KIND) {
2032 _PyUnicode_CONVERT_BYTES(
2033 Py_UCS2, Py_UCS4,
2034 PyUnicode_2BYTE_DATA(s),
2035 PyUnicode_2BYTE_DATA(s) + len,
2036 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002038 else {
2039 assert(skind == PyUnicode_1BYTE_KIND);
2040 _PyUnicode_CONVERT_BYTES(
2041 Py_UCS1, Py_UCS4,
2042 PyUnicode_1BYTE_DATA(s),
2043 PyUnicode_1BYTE_DATA(s) + len,
2044 result);
2045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002047 default:
2048 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002049 }
Victor Stinner01698042011-10-04 00:04:26 +02002050 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002051 return NULL;
2052}
2053
2054static Py_UCS4*
2055as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2056 int copy_null)
2057{
2058 int kind;
2059 void *data;
2060 Py_ssize_t len, targetlen;
2061 if (PyUnicode_READY(string) == -1)
2062 return NULL;
2063 kind = PyUnicode_KIND(string);
2064 data = PyUnicode_DATA(string);
2065 len = PyUnicode_GET_LENGTH(string);
2066 targetlen = len;
2067 if (copy_null)
2068 targetlen++;
2069 if (!target) {
2070 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2071 PyErr_NoMemory();
2072 return NULL;
2073 }
2074 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2075 if (!target) {
2076 PyErr_NoMemory();
2077 return NULL;
2078 }
2079 }
2080 else {
2081 if (targetsize < targetlen) {
2082 PyErr_Format(PyExc_SystemError,
2083 "string is longer than the buffer");
2084 if (copy_null && 0 < targetsize)
2085 target[0] = 0;
2086 return NULL;
2087 }
2088 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002089 if (kind == PyUnicode_1BYTE_KIND) {
2090 Py_UCS1 *start = (Py_UCS1 *) data;
2091 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002092 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002093 else if (kind == PyUnicode_2BYTE_KIND) {
2094 Py_UCS2 *start = (Py_UCS2 *) data;
2095 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2096 }
2097 else {
2098 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002100 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002101 if (copy_null)
2102 target[len] = 0;
2103 return target;
2104}
2105
2106Py_UCS4*
2107PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2108 int copy_null)
2109{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002110 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002111 PyErr_BadInternalCall();
2112 return NULL;
2113 }
2114 return as_ucs4(string, target, targetsize, copy_null);
2115}
2116
2117Py_UCS4*
2118PyUnicode_AsUCS4Copy(PyObject *string)
2119{
2120 return as_ucs4(string, NULL, 0, 1);
2121}
2122
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wchar_t buffer `w`.

   A size of -1 means that `w` is NUL-terminated and its length is taken
   with wcslen().  A NULL buffer is accepted only together with size 0
   (an empty string is returned); any other size with a NULL buffer is
   reported through PyErr_BadInternalCall(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w == NULL) {
        if (size != 0) {
            PyErr_BadInternalCall();
            return NULL;
        }
        return PyUnicode_New(0, 0);
    }

    if (size == -1)
        size = wcslen(w);

    return PyUnicode_FromUnicode(w, size);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002143
Walter Dörwald346737f2007-05-31 10:44:43 +00002144static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002145makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2146 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002147{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002148 *fmt++ = '%';
2149 if (width) {
2150 if (zeropad)
2151 *fmt++ = '0';
2152 fmt += sprintf(fmt, "%d", width);
2153 }
2154 if (precision)
2155 fmt += sprintf(fmt, ".%d", precision);
2156 if (longflag)
2157 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002158 else if (longlongflag) {
2159 /* longlongflag should only ever be nonzero on machines with
2160 HAVE_LONG_LONG defined */
2161#ifdef HAVE_LONG_LONG
2162 char *f = PY_FORMAT_LONG_LONG;
2163 while (*f)
2164 *fmt++ = *f++;
2165#else
2166 /* we shouldn't ever get here */
2167 assert(0);
2168 *fmt++ = 'l';
2169#endif
2170 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002171 else if (size_tflag) {
2172 char *f = PY_FORMAT_SIZE_T;
2173 while (*f)
2174 *fmt++ = *f++;
2175 }
2176 *fmt++ = c;
2177 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002178}
2179
/* helper for PyUnicode_FromFormatV()

   f must point at the '%' that starts a conversion.  Parses the optional
   "width.precision" part and an optional length modifier ("l", "ll" when
   HAVE_LONG_LONG is defined, or "z"); a modifier is only recognised when
   it is directly followed by 'd', 'u' or 'i'.  Each result is stored
   through the matching pointer unless that pointer is NULL.

   Returns a pointer to the character the caller should dispatch on,
   normally the conversion character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* step over the '%' and read the width.precision part,
       e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        field_width = (field_width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            field_precision = (field_precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => go back so that f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %li, %lld, %llu, %lli and the size_t forms
       %zd, %zu, %zi. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_size_t = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    /* hand the results back; every output pointer is optional */
    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;
    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
2244
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002245/* maximum number of characters required for output of %ld. 21 characters
2246 allows for 64-bit integers (in decimal) and an optional sign. */
2247#define MAX_LONG_CHARS 21
2248/* maximum number of characters required for output of %lld.
2249 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2250 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2251#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2252
Walter Dörwaldd2034312007-05-18 16:29:38 +00002253PyObject *
2254PyUnicode_FromFormatV(const char *format, va_list vargs)
2255{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002256 va_list count;
2257 Py_ssize_t callcount = 0;
2258 PyObject **callresults = NULL;
2259 PyObject **callresult = NULL;
2260 Py_ssize_t n = 0;
2261 int width = 0;
2262 int precision = 0;
2263 int zeropad;
2264 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002265 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002266 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002267 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002268 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2269 Py_UCS4 argmaxchar;
2270 Py_ssize_t numbersize = 0;
2271 char *numberresults = NULL;
2272 char *numberresult = NULL;
2273 Py_ssize_t i;
2274 int kind;
2275 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002276
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002277 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002278 /* step 1: count the number of %S/%R/%A/%s format specifications
2279 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2280 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002282 * also estimate a upper bound for all the number formats in the string,
2283 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002285 for (f = format; *f; f++) {
2286 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002287 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002288 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2289 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2290 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2291 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002293 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002294#ifdef HAVE_LONG_LONG
2295 if (longlongflag) {
2296 if (width < MAX_LONG_LONG_CHARS)
2297 width = MAX_LONG_LONG_CHARS;
2298 }
2299 else
2300#endif
2301 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2302 including sign. Decimal takes the most space. This
2303 isn't enough for octal. If a width is specified we
2304 need more (which we allocate later). */
2305 if (width < MAX_LONG_CHARS)
2306 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002307
2308 /* account for the size + '\0' to separate numbers
2309 inside of the numberresults buffer */
2310 numbersize += (width + 1);
2311 }
2312 }
2313 else if ((unsigned char)*f > 127) {
2314 PyErr_Format(PyExc_ValueError,
2315 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2316 "string, got a non-ASCII byte: 0x%02x",
2317 (unsigned char)*f);
2318 return NULL;
2319 }
2320 }
2321 /* step 2: allocate memory for the results of
2322 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2323 if (callcount) {
2324 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2325 if (!callresults) {
2326 PyErr_NoMemory();
2327 return NULL;
2328 }
2329 callresult = callresults;
2330 }
2331 /* step 2.5: allocate memory for the results of formating numbers */
2332 if (numbersize) {
2333 numberresults = PyObject_Malloc(numbersize);
2334 if (!numberresults) {
2335 PyErr_NoMemory();
2336 goto fail;
2337 }
2338 numberresult = numberresults;
2339 }
2340
2341 /* step 3: format numbers and figure out how large a buffer we need */
2342 for (f = format; *f; f++) {
2343 if (*f == '%') {
2344 const char* p;
2345 int longflag;
2346 int longlongflag;
2347 int size_tflag;
2348 int numprinted;
2349
2350 p = f;
2351 zeropad = (f[1] == '0');
2352 f = parse_format_flags(f, &width, &precision,
2353 &longflag, &longlongflag, &size_tflag);
2354 switch (*f) {
2355 case 'c':
2356 {
2357 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002358 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002359 n++;
2360 break;
2361 }
2362 case '%':
2363 n++;
2364 break;
2365 case 'i':
2366 case 'd':
2367 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2368 width, precision, *f);
2369 if (longflag)
2370 numprinted = sprintf(numberresult, fmt,
2371 va_arg(count, long));
2372#ifdef HAVE_LONG_LONG
2373 else if (longlongflag)
2374 numprinted = sprintf(numberresult, fmt,
2375 va_arg(count, PY_LONG_LONG));
2376#endif
2377 else if (size_tflag)
2378 numprinted = sprintf(numberresult, fmt,
2379 va_arg(count, Py_ssize_t));
2380 else
2381 numprinted = sprintf(numberresult, fmt,
2382 va_arg(count, int));
2383 n += numprinted;
2384 /* advance by +1 to skip over the '\0' */
2385 numberresult += (numprinted + 1);
2386 assert(*(numberresult - 1) == '\0');
2387 assert(*(numberresult - 2) != '\0');
2388 assert(numprinted >= 0);
2389 assert(numberresult <= numberresults + numbersize);
2390 break;
2391 case 'u':
2392 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2393 width, precision, 'u');
2394 if (longflag)
2395 numprinted = sprintf(numberresult, fmt,
2396 va_arg(count, unsigned long));
2397#ifdef HAVE_LONG_LONG
2398 else if (longlongflag)
2399 numprinted = sprintf(numberresult, fmt,
2400 va_arg(count, unsigned PY_LONG_LONG));
2401#endif
2402 else if (size_tflag)
2403 numprinted = sprintf(numberresult, fmt,
2404 va_arg(count, size_t));
2405 else
2406 numprinted = sprintf(numberresult, fmt,
2407 va_arg(count, unsigned int));
2408 n += numprinted;
2409 numberresult += (numprinted + 1);
2410 assert(*(numberresult - 1) == '\0');
2411 assert(*(numberresult - 2) != '\0');
2412 assert(numprinted >= 0);
2413 assert(numberresult <= numberresults + numbersize);
2414 break;
2415 case 'x':
2416 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2417 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2418 n += numprinted;
2419 numberresult += (numprinted + 1);
2420 assert(*(numberresult - 1) == '\0');
2421 assert(*(numberresult - 2) != '\0');
2422 assert(numprinted >= 0);
2423 assert(numberresult <= numberresults + numbersize);
2424 break;
2425 case 'p':
2426 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2427 /* %p is ill-defined: ensure leading 0x. */
2428 if (numberresult[1] == 'X')
2429 numberresult[1] = 'x';
2430 else if (numberresult[1] != 'x') {
2431 memmove(numberresult + 2, numberresult,
2432 strlen(numberresult) + 1);
2433 numberresult[0] = '0';
2434 numberresult[1] = 'x';
2435 numprinted += 2;
2436 }
2437 n += numprinted;
2438 numberresult += (numprinted + 1);
2439 assert(*(numberresult - 1) == '\0');
2440 assert(*(numberresult - 2) != '\0');
2441 assert(numprinted >= 0);
2442 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002443 break;
2444 case 's':
2445 {
2446 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002447 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002448 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2449 if (!str)
2450 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 /* since PyUnicode_DecodeUTF8 returns already flexible
2452 unicode objects, there is no need to call ready on them */
2453 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002454 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002456 /* Remember the str and switch to the next slot */
2457 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002458 break;
2459 }
2460 case 'U':
2461 {
2462 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002463 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002464 if (PyUnicode_READY(obj) == -1)
2465 goto fail;
2466 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002467 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002469 break;
2470 }
2471 case 'V':
2472 {
2473 PyObject *obj = va_arg(count, PyObject *);
2474 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002475 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002476 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002477 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002478 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002479 if (PyUnicode_READY(obj) == -1)
2480 goto fail;
2481 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002482 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002483 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002484 *callresult++ = NULL;
2485 }
2486 else {
2487 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2488 if (!str_obj)
2489 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002490 if (PyUnicode_READY(str_obj)) {
2491 Py_DECREF(str_obj);
2492 goto fail;
2493 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002494 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002495 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002497 *callresult++ = str_obj;
2498 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002499 break;
2500 }
2501 case 'S':
2502 {
2503 PyObject *obj = va_arg(count, PyObject *);
2504 PyObject *str;
2505 assert(obj);
2506 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002507 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002508 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002509 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002510 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002511 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002512 /* Remember the str and switch to the next slot */
2513 *callresult++ = str;
2514 break;
2515 }
2516 case 'R':
2517 {
2518 PyObject *obj = va_arg(count, PyObject *);
2519 PyObject *repr;
2520 assert(obj);
2521 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002522 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002523 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002524 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002525 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002527 /* Remember the repr and switch to the next slot */
2528 *callresult++ = repr;
2529 break;
2530 }
2531 case 'A':
2532 {
2533 PyObject *obj = va_arg(count, PyObject *);
2534 PyObject *ascii;
2535 assert(obj);
2536 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002537 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002538 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002540 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002541 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002542 /* Remember the repr and switch to the next slot */
2543 *callresult++ = ascii;
2544 break;
2545 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002546 default:
2547 /* if we stumble upon an unknown
2548 formatting code, copy the rest of
2549 the format string to the output
2550 string. (we cannot just skip the
2551 code, since there's no way to know
2552 what's in the argument list) */
2553 n += strlen(p);
2554 goto expand;
2555 }
2556 } else
2557 n++;
2558 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002559 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002562 we don't have to resize the string.
2563 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002564 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002565 if (!string)
2566 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 kind = PyUnicode_KIND(string);
2568 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002569 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002572 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002573 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002574 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002575
2576 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002577 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2578 /* checking for == because the last argument could be a empty
2579 string, which causes i to point to end, the assert at the end of
2580 the loop */
2581 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002582
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 switch (*f) {
2584 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002585 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002586 const int ordinal = va_arg(vargs, int);
2587 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002588 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002589 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002590 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002592 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002593 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002594 case 'p':
2595 /* unused, since we already have the result */
2596 if (*f == 'p')
2597 (void) va_arg(vargs, void *);
2598 else
2599 (void) va_arg(vargs, int);
2600 /* extract the result from numberresults and append. */
2601 for (; *numberresult; ++i, ++numberresult)
2602 PyUnicode_WRITE(kind, data, i, *numberresult);
2603 /* skip over the separating '\0' */
2604 assert(*numberresult == '\0');
2605 numberresult++;
2606 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002607 break;
2608 case 's':
2609 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002610 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002611 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002612 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613 size = PyUnicode_GET_LENGTH(*callresult);
2614 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002615 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002616 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002617 /* We're done with the unicode()/repr() => forget it */
2618 Py_DECREF(*callresult);
2619 /* switch to next unicode()/repr() result */
2620 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002621 break;
2622 }
2623 case 'U':
2624 {
2625 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 Py_ssize_t size;
2627 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2628 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002629 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002630 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002631 break;
2632 }
2633 case 'V':
2634 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002636 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002637 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639 size = PyUnicode_GET_LENGTH(obj);
2640 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002641 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002642 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002643 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002644 size = PyUnicode_GET_LENGTH(*callresult);
2645 assert(PyUnicode_KIND(*callresult) <=
2646 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002647 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002648 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002649 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002650 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002651 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002652 break;
2653 }
2654 case 'S':
2655 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002656 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002657 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002658 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002659 /* unused, since we already have the result */
2660 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002661 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002662 copy_characters(string, i, *callresult, 0, size);
2663 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002664 /* We're done with the unicode()/repr() => forget it */
2665 Py_DECREF(*callresult);
2666 /* switch to next unicode()/repr() result */
2667 ++callresult;
2668 break;
2669 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002670 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002671 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002672 break;
2673 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002674 for (; *p; ++p, ++i)
2675 PyUnicode_WRITE(kind, data, i, *p);
2676 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002677 goto end;
2678 }
Victor Stinner1205f272010-09-11 00:54:47 +00002679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002680 else {
2681 assert(i < PyUnicode_GET_LENGTH(string));
2682 PyUnicode_WRITE(kind, data, i++, *f);
2683 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002685 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002686
Benjamin Peterson29060642009-01-31 22:14:21 +00002687 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002688 if (callresults)
2689 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 if (numberresults)
2691 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002692 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002693 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 if (callresults) {
2695 PyObject **callresult2 = callresults;
2696 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002697 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002698 ++callresult2;
2699 }
2700 PyObject_Free(callresults);
2701 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002702 if (numberresults)
2703 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002704 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002705}
2706
Walter Dörwaldd2034312007-05-18 16:29:38 +00002707PyObject *
2708PyUnicode_FromFormat(const char *format, ...)
2709{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 PyObject* ret;
2711 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002712
2713#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002714 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002715#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002716 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002717#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002718 ret = PyUnicode_FromFormatV(format, vargs);
2719 va_end(vargs);
2720 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002721}
2722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002723#ifdef HAVE_WCHAR_H
2724
Victor Stinner5593d8a2010-10-02 11:11:27 +00002725/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2726 convert a Unicode object to a wide character string.
2727
Victor Stinnerd88d9832011-09-06 02:00:05 +02002728 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002729 character) required to convert the unicode object. Ignore size argument.
2730
Victor Stinnerd88d9832011-09-06 02:00:05 +02002731 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002732 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002733 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002734static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002735unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002736 wchar_t *w,
2737 Py_ssize_t size)
2738{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002739 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002740 const wchar_t *wstr;
2741
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002742 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002743 if (wstr == NULL)
2744 return -1;
2745
Victor Stinner5593d8a2010-10-02 11:11:27 +00002746 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002747 if (size > res)
2748 size = res + 1;
2749 else
2750 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002752 return res;
2753 }
2754 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002755 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002756}
2757
2758Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002759PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002760 wchar_t *w,
2761 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762{
2763 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002764 PyErr_BadInternalCall();
2765 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002767 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768}
2769
Victor Stinner137c34c2010-09-29 10:25:54 +00002770wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002771PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002772 Py_ssize_t *size)
2773{
2774 wchar_t* buffer;
2775 Py_ssize_t buflen;
2776
2777 if (unicode == NULL) {
2778 PyErr_BadInternalCall();
2779 return NULL;
2780 }
2781
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002782 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002783 if (buflen == -1)
2784 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002785 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002786 PyErr_NoMemory();
2787 return NULL;
2788 }
2789
Victor Stinner137c34c2010-09-29 10:25:54 +00002790 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2791 if (buffer == NULL) {
2792 PyErr_NoMemory();
2793 return NULL;
2794 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002795 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002796 if (buflen == -1)
2797 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002798 if (size != NULL)
2799 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002800 return buffer;
2801}
2802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002803#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804
Alexander Belopolsky40018472011-02-26 01:02:56 +00002805PyObject *
2806PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002807{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002808 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002809 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002810 PyErr_SetString(PyExc_ValueError,
2811 "chr() arg not in range(0x110000)");
2812 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002813 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002815 if (ordinal < 256)
2816 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002818 v = PyUnicode_New(1, ordinal);
2819 if (v == NULL)
2820 return NULL;
2821 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002822 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002823 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002824}
2825
Alexander Belopolsky40018472011-02-26 01:02:56 +00002826PyObject *
2827PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002829 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002830 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002831 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002832 if (PyUnicode_READY(obj))
2833 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002834 Py_INCREF(obj);
2835 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002836 }
2837 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002838 /* For a Unicode subtype that's not a Unicode object,
2839 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002840 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002841 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002842 PyErr_Format(PyExc_TypeError,
2843 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002844 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002845 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002846}
2847
Alexander Belopolsky40018472011-02-26 01:02:56 +00002848PyObject *
2849PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002850 const char *encoding,
2851 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002852{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002853 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002854 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002855
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002857 PyErr_BadInternalCall();
2858 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002860
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002861 /* Decoding bytes objects is the most common case and should be fast */
2862 if (PyBytes_Check(obj)) {
2863 if (PyBytes_GET_SIZE(obj) == 0) {
2864 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002865 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002866 }
2867 else {
2868 v = PyUnicode_Decode(
2869 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2870 encoding, errors);
2871 }
2872 return v;
2873 }
2874
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002875 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002876 PyErr_SetString(PyExc_TypeError,
2877 "decoding str is not supported");
2878 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002879 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002880
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002881 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2882 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2883 PyErr_Format(PyExc_TypeError,
2884 "coercing to str: need bytes, bytearray "
2885 "or buffer-like object, %.80s found",
2886 Py_TYPE(obj)->tp_name);
2887 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002888 }
Tim Petersced69f82003-09-16 20:30:58 +00002889
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002890 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002891 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002892 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893 }
Tim Petersced69f82003-09-16 20:30:58 +00002894 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002895 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002896
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002897 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002898 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002899}
2900
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result is written to lower, a buffer of lower_len bytes, and is
   NUL-terminated on success.

   Return 0 on error (the normalized name plus its trailing NUL does not fit
   into lower_len bytes), 1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Without room for the terminating NUL nothing can be stored; this also
       keeps &lower[lower_len - 1] below from wrapping around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof() counts the trailing NUL as well */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last byte of the buffer: reserved for the terminating NUL */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2937
Alexander Belopolsky40018472011-02-26 01:02:56 +00002938PyObject *
2939PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002940 Py_ssize_t size,
2941 const char *encoding,
2942 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002943{
2944 PyObject *buffer = NULL, *unicode;
2945 Py_buffer info;
2946 char lower[11]; /* Enough for any encoding shortcut */
2947
Fred Drakee4315f52000-05-09 19:53:39 +00002948 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002949 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002950 if ((strcmp(lower, "utf-8") == 0) ||
2951 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002952 return PyUnicode_DecodeUTF8(s, size, errors);
2953 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002954 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002955 (strcmp(lower, "iso-8859-1") == 0))
2956 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002957#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002958 else if (strcmp(lower, "mbcs") == 0)
2959 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002960#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002961 else if (strcmp(lower, "ascii") == 0)
2962 return PyUnicode_DecodeASCII(s, size, errors);
2963 else if (strcmp(lower, "utf-16") == 0)
2964 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2965 else if (strcmp(lower, "utf-32") == 0)
2966 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968
2969 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002970 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002971 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002972 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002973 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974 if (buffer == NULL)
2975 goto onError;
2976 unicode = PyCodec_Decode(buffer, encoding, errors);
2977 if (unicode == NULL)
2978 goto onError;
2979 if (!PyUnicode_Check(unicode)) {
2980 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002981 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002982 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983 Py_DECREF(unicode);
2984 goto onError;
2985 }
2986 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002987 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002988
Benjamin Peterson29060642009-01-31 22:14:21 +00002989 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990 Py_XDECREF(buffer);
2991 return NULL;
2992}
2993
Alexander Belopolsky40018472011-02-26 01:02:56 +00002994PyObject *
2995PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002996 const char *encoding,
2997 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002998{
2999 PyObject *v;
3000
3001 if (!PyUnicode_Check(unicode)) {
3002 PyErr_BadArgument();
3003 goto onError;
3004 }
3005
3006 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003007 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003008
3009 /* Decode via the codec registry */
3010 v = PyCodec_Decode(unicode, encoding, errors);
3011 if (v == NULL)
3012 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003013 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003014
Benjamin Peterson29060642009-01-31 22:14:21 +00003015 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003016 return NULL;
3017}
3018
Alexander Belopolsky40018472011-02-26 01:02:56 +00003019PyObject *
3020PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003021 const char *encoding,
3022 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003023{
3024 PyObject *v;
3025
3026 if (!PyUnicode_Check(unicode)) {
3027 PyErr_BadArgument();
3028 goto onError;
3029 }
3030
3031 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003032 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003033
3034 /* Decode via the codec registry */
3035 v = PyCodec_Decode(unicode, encoding, errors);
3036 if (v == NULL)
3037 goto onError;
3038 if (!PyUnicode_Check(v)) {
3039 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003040 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003041 Py_TYPE(v)->tp_name);
3042 Py_DECREF(v);
3043 goto onError;
3044 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003045 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003046
Benjamin Peterson29060642009-01-31 22:14:21 +00003047 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003048 return NULL;
3049}
3050
Alexander Belopolsky40018472011-02-26 01:02:56 +00003051PyObject *
3052PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003053 Py_ssize_t size,
3054 const char *encoding,
3055 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056{
3057 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003058
Guido van Rossumd57fd912000-03-10 22:53:23 +00003059 unicode = PyUnicode_FromUnicode(s, size);
3060 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003061 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003062 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3063 Py_DECREF(unicode);
3064 return v;
3065}
3066
Alexander Belopolsky40018472011-02-26 01:02:56 +00003067PyObject *
3068PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003069 const char *encoding,
3070 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003071{
3072 PyObject *v;
3073
3074 if (!PyUnicode_Check(unicode)) {
3075 PyErr_BadArgument();
3076 goto onError;
3077 }
3078
3079 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003080 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003081
3082 /* Encode via the codec registry */
3083 v = PyCodec_Encode(unicode, encoding, errors);
3084 if (v == NULL)
3085 goto onError;
3086 return v;
3087
Benjamin Peterson29060642009-01-31 22:14:21 +00003088 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003089 return NULL;
3090}
3091
Victor Stinnerad158722010-10-27 00:25:46 +00003092PyObject *
3093PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003094{
Victor Stinner99b95382011-07-04 14:23:54 +02003095#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003096 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003097#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003098 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003099#else
Victor Stinner793b5312011-04-27 00:24:21 +02003100 PyInterpreterState *interp = PyThreadState_GET()->interp;
3101 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3102 cannot use it to encode and decode filenames before it is loaded. Load
3103 the Python codec requires to encode at least its own filename. Use the C
3104 version of the locale codec until the codec registry is initialized and
3105 the Python codec is loaded.
3106
3107 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3108 cannot only rely on it: check also interp->fscodec_initialized for
3109 subinterpreters. */
3110 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003111 return PyUnicode_AsEncodedString(unicode,
3112 Py_FileSystemDefaultEncoding,
3113 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003114 }
3115 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003116 /* locale encoding with surrogateescape */
3117 wchar_t *wchar;
3118 char *bytes;
3119 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003120 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003121
3122 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3123 if (wchar == NULL)
3124 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003125 bytes = _Py_wchar2char(wchar, &error_pos);
3126 if (bytes == NULL) {
3127 if (error_pos != (size_t)-1) {
3128 char *errmsg = strerror(errno);
3129 PyObject *exc = NULL;
3130 if (errmsg == NULL)
3131 errmsg = "Py_wchar2char() failed";
3132 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003133 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003134 error_pos, error_pos+1,
3135 errmsg);
3136 Py_XDECREF(exc);
3137 }
3138 else
3139 PyErr_NoMemory();
3140 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003141 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003142 }
3143 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003144
3145 bytes_obj = PyBytes_FromString(bytes);
3146 PyMem_Free(bytes);
3147 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003148 }
Victor Stinnerad158722010-10-27 00:25:46 +00003149#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003150}
3151
Alexander Belopolsky40018472011-02-26 01:02:56 +00003152PyObject *
3153PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003154 const char *encoding,
3155 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156{
3157 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003158 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003159
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 if (!PyUnicode_Check(unicode)) {
3161 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003162 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163 }
Fred Drakee4315f52000-05-09 19:53:39 +00003164
Fred Drakee4315f52000-05-09 19:53:39 +00003165 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003166 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003167 if ((strcmp(lower, "utf-8") == 0) ||
3168 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003169 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003170 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003171 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003172 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003173 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003174 }
Victor Stinner37296e82010-06-10 13:36:23 +00003175 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003176 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003177 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003178 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003179#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003180 else if (strcmp(lower, "mbcs") == 0)
3181 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003182#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003183 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003184 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003185 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003186
3187 /* Encode via the codec registry */
3188 v = PyCodec_Encode(unicode, encoding, errors);
3189 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003190 return NULL;
3191
3192 /* The normal path */
3193 if (PyBytes_Check(v))
3194 return v;
3195
3196 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003197 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003198 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003199 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003200
3201 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3202 "encoder %s returned bytearray instead of bytes",
3203 encoding);
3204 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003205 Py_DECREF(v);
3206 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003207 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003208
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003209 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3210 Py_DECREF(v);
3211 return b;
3212 }
3213
3214 PyErr_Format(PyExc_TypeError,
3215 "encoder did not return a bytes object (type=%.400s)",
3216 Py_TYPE(v)->tp_name);
3217 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003218 return NULL;
3219}
3220
Alexander Belopolsky40018472011-02-26 01:02:56 +00003221PyObject *
3222PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003223 const char *encoding,
3224 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003225{
3226 PyObject *v;
3227
3228 if (!PyUnicode_Check(unicode)) {
3229 PyErr_BadArgument();
3230 goto onError;
3231 }
3232
3233 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003234 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003235
3236 /* Encode via the codec registry */
3237 v = PyCodec_Encode(unicode, encoding, errors);
3238 if (v == NULL)
3239 goto onError;
3240 if (!PyUnicode_Check(v)) {
3241 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003242 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003243 Py_TYPE(v)->tp_name);
3244 Py_DECREF(v);
3245 goto onError;
3246 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003248
Benjamin Peterson29060642009-01-31 22:14:21 +00003249 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250 return NULL;
3251}
3252
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003253PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003254PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003255 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003256 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3257}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003258
Christian Heimes5894ba72007-11-04 11:43:14 +00003259PyObject*
3260PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3261{
Victor Stinner99b95382011-07-04 14:23:54 +02003262#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003263 return PyUnicode_DecodeMBCS(s, size, NULL);
3264#elif defined(__APPLE__)
3265 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3266#else
Victor Stinner793b5312011-04-27 00:24:21 +02003267 PyInterpreterState *interp = PyThreadState_GET()->interp;
3268 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3269 cannot use it to encode and decode filenames before it is loaded. Load
3270 the Python codec requires to encode at least its own filename. Use the C
3271 version of the locale codec until the codec registry is initialized and
3272 the Python codec is loaded.
3273
3274 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3275 cannot only rely on it: check also interp->fscodec_initialized for
3276 subinterpreters. */
3277 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003278 return PyUnicode_Decode(s, size,
3279 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003280 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003281 }
3282 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003283 /* locale encoding with surrogateescape */
3284 wchar_t *wchar;
3285 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003286 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003287
3288 if (s[size] != '\0' || size != strlen(s)) {
3289 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3290 return NULL;
3291 }
3292
Victor Stinner168e1172010-10-16 23:16:16 +00003293 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003294 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003295 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003296
Victor Stinner168e1172010-10-16 23:16:16 +00003297 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003298 PyMem_Free(wchar);
3299 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003300 }
Victor Stinnerad158722010-10-27 00:25:46 +00003301#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003302}
3303
Martin v. Löwis011e8422009-05-05 04:43:17 +00003304
3305int
3306PyUnicode_FSConverter(PyObject* arg, void* addr)
3307{
3308 PyObject *output = NULL;
3309 Py_ssize_t size;
3310 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003311 if (arg == NULL) {
3312 Py_DECREF(*(PyObject**)addr);
3313 return 1;
3314 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003315 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003316 output = arg;
3317 Py_INCREF(output);
3318 }
3319 else {
3320 arg = PyUnicode_FromObject(arg);
3321 if (!arg)
3322 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003323 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003324 Py_DECREF(arg);
3325 if (!output)
3326 return 0;
3327 if (!PyBytes_Check(output)) {
3328 Py_DECREF(output);
3329 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3330 return 0;
3331 }
3332 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003333 size = PyBytes_GET_SIZE(output);
3334 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003335 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003336 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003337 Py_DECREF(output);
3338 return 0;
3339 }
3340 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003341 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003342}
3343
3344
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003345int
3346PyUnicode_FSDecoder(PyObject* arg, void* addr)
3347{
3348 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003349 if (arg == NULL) {
3350 Py_DECREF(*(PyObject**)addr);
3351 return 1;
3352 }
3353 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003354 if (PyUnicode_READY(arg))
3355 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003356 output = arg;
3357 Py_INCREF(output);
3358 }
3359 else {
3360 arg = PyBytes_FromObject(arg);
3361 if (!arg)
3362 return 0;
3363 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3364 PyBytes_GET_SIZE(arg));
3365 Py_DECREF(arg);
3366 if (!output)
3367 return 0;
3368 if (!PyUnicode_Check(output)) {
3369 Py_DECREF(output);
3370 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3371 return 0;
3372 }
3373 }
Victor Stinner065836e2011-10-27 01:56:33 +02003374 if (PyUnicode_READY(output) < 0) {
3375 Py_DECREF(output);
3376 return 0;
3377 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003378 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003379 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003380 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3381 Py_DECREF(output);
3382 return 0;
3383 }
3384 *(PyObject**)addr = output;
3385 return Py_CLEANUP_SUPPORTED;
3386}
3387
3388
Martin v. Löwis5b222132007-06-10 09:51:05 +00003389char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003390PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003391{
Christian Heimesf3863112007-11-22 07:46:41 +00003392 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003393
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003394 if (!PyUnicode_Check(unicode)) {
3395 PyErr_BadArgument();
3396 return NULL;
3397 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003398 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003399 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003400
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003401 if (PyUnicode_UTF8(unicode) == NULL) {
3402 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003403 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3404 if (bytes == NULL)
3405 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003406 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3407 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003408 Py_DECREF(bytes);
3409 return NULL;
3410 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003411 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3412 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3413 PyBytes_AS_STRING(bytes),
3414 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003415 Py_DECREF(bytes);
3416 }
3417
3418 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003419 *psize = PyUnicode_UTF8_LENGTH(unicode);
3420 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003421}
3422
3423char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003424PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003425{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003426 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3427}
3428
3429#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003430static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003431#endif
3432
3433
3434Py_UNICODE *
3435PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3436{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003437 const unsigned char *one_byte;
3438#if SIZEOF_WCHAR_T == 4
3439 const Py_UCS2 *two_bytes;
3440#else
3441 const Py_UCS4 *four_bytes;
3442 const Py_UCS4 *ucs4_end;
3443 Py_ssize_t num_surrogates;
3444#endif
3445 wchar_t *w;
3446 wchar_t *wchar_end;
3447
3448 if (!PyUnicode_Check(unicode)) {
3449 PyErr_BadArgument();
3450 return NULL;
3451 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003452 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003453 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003454 assert(_PyUnicode_KIND(unicode) != 0);
3455 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003456
3457#ifdef Py_DEBUG
3458 ++unicode_as_unicode_calls;
3459#endif
3460
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003461 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003462#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003463 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3464 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003465 num_surrogates = 0;
3466
3467 for (; four_bytes < ucs4_end; ++four_bytes) {
3468 if (*four_bytes > 0xFFFF)
3469 ++num_surrogates;
3470 }
3471
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003472 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3473 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3474 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003475 PyErr_NoMemory();
3476 return NULL;
3477 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003478 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003479
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003480 w = _PyUnicode_WSTR(unicode);
3481 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3482 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003483 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3484 if (*four_bytes > 0xFFFF) {
3485 /* encode surrogate pair in this case */
3486 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3487 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3488 }
3489 else
3490 *w = *four_bytes;
3491
3492 if (w > wchar_end) {
3493 assert(0 && "Miscalculated string end");
3494 }
3495 }
3496 *w = 0;
3497#else
3498 /* sizeof(wchar_t) == 4 */
3499 Py_FatalError("Impossible unicode object state, wstr and str "
3500 "should share memory already.");
3501 return NULL;
3502#endif
3503 }
3504 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003505 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3506 (_PyUnicode_LENGTH(unicode) + 1));
3507 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003508 PyErr_NoMemory();
3509 return NULL;
3510 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003511 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3512 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3513 w = _PyUnicode_WSTR(unicode);
3514 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003515
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003516 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3517 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003518 for (; w < wchar_end; ++one_byte, ++w)
3519 *w = *one_byte;
3520 /* null-terminate the wstr */
3521 *w = 0;
3522 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003523 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003524#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003525 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003526 for (; w < wchar_end; ++two_bytes, ++w)
3527 *w = *two_bytes;
3528 /* null-terminate the wstr */
3529 *w = 0;
3530#else
3531 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003532 PyObject_FREE(_PyUnicode_WSTR(unicode));
3533 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003534 Py_FatalError("Impossible unicode object state, wstr "
3535 "and str should share memory already.");
3536 return NULL;
3537#endif
3538 }
3539 else {
3540 assert(0 && "This should never happen.");
3541 }
3542 }
3543 }
3544 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003545 *size = PyUnicode_WSTR_LENGTH(unicode);
3546 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003547}
3548
Alexander Belopolsky40018472011-02-26 01:02:56 +00003549Py_UNICODE *
3550PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003552 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553}
3554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003555
Alexander Belopolsky40018472011-02-26 01:02:56 +00003556Py_ssize_t
3557PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003558{
3559 if (!PyUnicode_Check(unicode)) {
3560 PyErr_BadArgument();
3561 goto onError;
3562 }
3563 return PyUnicode_GET_SIZE(unicode);
3564
Benjamin Peterson29060642009-01-31 22:14:21 +00003565 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003566 return -1;
3567}
3568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003569Py_ssize_t
3570PyUnicode_GetLength(PyObject *unicode)
3571{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003572 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003573 PyErr_BadArgument();
3574 return -1;
3575 }
3576
3577 return PyUnicode_GET_LENGTH(unicode);
3578}
3579
3580Py_UCS4
3581PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3582{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003583 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3584 PyErr_BadArgument();
3585 return (Py_UCS4)-1;
3586 }
3587 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3588 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003589 return (Py_UCS4)-1;
3590 }
3591 return PyUnicode_READ_CHAR(unicode, index);
3592}
3593
3594int
3595PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3596{
3597 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003598 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003599 return -1;
3600 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003601 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3602 PyErr_SetString(PyExc_IndexError, "string index out of range");
3603 return -1;
3604 }
3605 if (_PyUnicode_Dirty(unicode))
3606 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003607 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3608 index, ch);
3609 return 0;
3610}
3611
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage duration. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3617
Victor Stinner554f3f02010-06-16 23:33:54 +00003618/* create or adjust a UnicodeDecodeError */
3619static void
3620make_decode_exception(PyObject **exceptionObject,
3621 const char *encoding,
3622 const char *input, Py_ssize_t length,
3623 Py_ssize_t startpos, Py_ssize_t endpos,
3624 const char *reason)
3625{
3626 if (*exceptionObject == NULL) {
3627 *exceptionObject = PyUnicodeDecodeError_Create(
3628 encoding, input, length, startpos, endpos, reason);
3629 }
3630 else {
3631 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3632 goto onError;
3633 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3634 goto onError;
3635 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3636 goto onError;
3637 }
3638 return;
3639
3640onError:
3641 Py_DECREF(*exceptionObject);
3642 *exceptionObject = NULL;
3643}
3644
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003645/* error handling callback helper:
3646 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003647 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003648 and adjust various state variables.
3649 return 0 on success, -1 on error
3650*/
3651
Alexander Belopolsky40018472011-02-26 01:02:56 +00003652static int
3653unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003654 const char *encoding, const char *reason,
3655 const char **input, const char **inend, Py_ssize_t *startinpos,
3656 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003657 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003659 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660
3661 PyObject *restuple = NULL;
3662 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003663 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003664 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003665 Py_ssize_t requiredsize;
3666 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003667 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003668 int res = -1;
3669
Victor Stinner596a6c42011-11-09 00:02:18 +01003670 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3671 outsize = PyUnicode_GET_LENGTH(*output);
3672 else
3673 outsize = _PyUnicode_WSTR_LENGTH(*output);
3674
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003676 *errorHandler = PyCodec_LookupError(errors);
3677 if (*errorHandler == NULL)
3678 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003679 }
3680
Victor Stinner554f3f02010-06-16 23:33:54 +00003681 make_decode_exception(exceptionObject,
3682 encoding,
3683 *input, *inend - *input,
3684 *startinpos, *endinpos,
3685 reason);
3686 if (*exceptionObject == NULL)
3687 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003688
3689 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3690 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003691 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003692 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003693 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003694 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 }
3696 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003697 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003698 if (PyUnicode_READY(repunicode) < 0)
3699 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003700
3701 /* Copy back the bytes variables, which might have been modified by the
3702 callback */
3703 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3704 if (!inputobj)
3705 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003706 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003707 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003708 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003709 *input = PyBytes_AS_STRING(inputobj);
3710 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003711 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003712 /* we can DECREF safely, as the exception has another reference,
3713 so the object won't go away. */
3714 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003715
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003716 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003717 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003718 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003719 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3720 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003721 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003722
Victor Stinner596a6c42011-11-09 00:02:18 +01003723 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3724 /* need more space? (at least enough for what we
3725 have+the replacement+the rest of the string (starting
3726 at the new input position), so we won't have to check space
3727 when there are no errors in the rest of the string) */
3728 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3729 requiredsize = *outpos + replen + insize-newpos;
3730 if (requiredsize > outsize) {
3731 if (requiredsize<2*outsize)
3732 requiredsize = 2*outsize;
3733 if (unicode_resize(output, requiredsize) < 0)
3734 goto onError;
3735 }
3736 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003737 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003738 copy_characters(*output, *outpos, repunicode, 0, replen);
3739 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003740 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003741 else {
3742 wchar_t *repwstr;
3743 Py_ssize_t repwlen;
3744 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3745 if (repwstr == NULL)
3746 goto onError;
3747 /* need more space? (at least enough for what we
3748 have+the replacement+the rest of the string (starting
3749 at the new input position), so we won't have to check space
3750 when there are no errors in the rest of the string) */
3751 requiredsize = *outpos + repwlen + insize-newpos;
3752 if (requiredsize > outsize) {
3753 if (requiredsize < 2*outsize)
3754 requiredsize = 2*outsize;
3755 if (unicode_resize(output, requiredsize) < 0)
3756 goto onError;
3757 }
3758 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3759 *outpos += repwlen;
3760 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003761 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003762 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003763
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003764 /* we made it! */
3765 res = 0;
3766
Benjamin Peterson29060642009-01-31 22:14:21 +00003767 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003768 Py_XDECREF(restuple);
3769 return res;
3770}
3771
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003772/* --- UTF-7 Codec -------------------------------------------------------- */
3773
Antoine Pitrou244651a2009-05-04 18:56:13 +00003774/* See RFC2152 for details. We encode conservatively and decode liberally. */
3775
/* Three simple macros defining base-64.  Each one may evaluate its
   argument more than once, so pass side-effect-free expressions. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* given that c is a base-64 character, what is its base-64 value?
   (anything that is not a letter, digit or '+' maps to 63, the value
   of '/') */

#define FROM_BASE64(c) \
    (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
3798
/* DECODE_DIRECT: does this byte, found outside a base64 section of a
 * UTF-7 string, decode as itself?  Decoding is permissive: every value
 * up to 127 qualifies except '+', which opens a base64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
3806
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times, so it must be free of side effects.
 * The range test on c comes first so the table is never indexed out of
 * bounds.  directO and directWS are parenthesized so that arguments
 * containing low-precedence operators (e.g. a || b) are evaluated as a
 * unit. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003852
Alexander Belopolsky40018472011-02-26 01:02:56 +00003853PyObject *
3854PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003855 Py_ssize_t size,
3856 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003857{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003858 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3859}
3860
Antoine Pitrou244651a2009-05-04 18:56:13 +00003861/* The decoder. The only state we preserve is our read position,
3862 * i.e. how many characters we have consumed. So if we end in the
3863 * middle of a shift sequence we have to back off the read position
3864 * and the output to the beginning of the sequence, otherwise we lose
3865 * all the shift state (seen bits, number of bits seen, high
3866 * surrogate). */
3867
/* Parameters:
 *   s, size   input bytes and their count.
 *   errors    error handler name, forwarded to
 *             unicode_decode_call_errorhandler.
 *   consumed  if non-NULL, receives the number of input bytes accounted
 *             for; when the input ends inside a shift sequence this is the
 *             offset of the opening '+' and the output is truncated back to
 *             where that sequence started.  If NULL, an input that ends
 *             inside a shift sequence in an inconsistent state is reported
 *             through the error handler ("unterminated shift sequence").
 * Returns the decoded string (via unicode_result), or NULL on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00003868PyObject *
3869PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003870 Py_ssize_t size,
3871 const char *errors,
3872 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003873{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003874 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003875 Py_ssize_t startinpos;
3876 Py_ssize_t endinpos;
3877 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003878 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003879 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003880 const char *errmsg = "";
/* inShift: non-zero while inside a '+'...  base64 section.
   shiftOutStart: output position at which the current section began.
   base64buffer/base64bits: bits collected so far and how many are valid.
   surrogate: pending first (high) surrogate, 0 if none. */
3881 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003882 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003883 unsigned int base64bits = 0;
3884 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003885 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003886 PyObject *errorHandler = NULL;
3887 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003888
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003889 /* Start off assuming it's all ASCII. Widen later as necessary. */
3890 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003891 if (!unicode)
3892 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003893 if (size == 0) {
3894 if (consumed)
3895 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003896 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003897 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003898
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003899 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003900 e = s + size;
3901
3902 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003903 Py_UCS4 ch;
/* Also entered by a goto from the end-of-input check below, when the
   error handler has left input to process (s < e). */
Benjamin Peterson29060642009-01-31 22:14:21 +00003904 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003905 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003906
Antoine Pitrou244651a2009-05-04 18:56:13 +00003907 if (inShift) { /* in a base-64 section */
3908 if (IS_BASE64(ch)) { /* consume a base-64 character */
/* Each base-64 character contributes 6 bits. */
3909 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3910 base64bits += 6;
3911 s++;
3912 if (base64bits >= 16) {
3913 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003914 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003915 base64bits -= 16;
3916 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3917 if (surrogate) {
3918 /* expecting a second surrogate */
3919 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
/* Combine the pending high surrogate with this low surrogate into one
   code point above 0xFFFF. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003920 Py_UCS4 ch2 = (((surrogate & 0x3FF)<<10)
3921 | (outCh & 0x3FF)) + 0x10000;
3922 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3923 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003924 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003925 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003926 }
3927 else {
/* Not a low surrogate: emit the pending high surrogate on its own, then
   fall through to handle outCh normally. */
Antoine Pitrou78edf752011-11-15 01:44:16 +01003928 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3929 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003930 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003931 }
3932 }
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003933 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003934 /* first surrogate */
3935 surrogate = outCh;
3936 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003937 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003938 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3939 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003940 }
3941 }
3942 }
3943 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003944 inShift = 0;
3945 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003946 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003947 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3948 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003949 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003950 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003951 if (base64bits > 0) { /* left-over bits */
3952 if (base64bits >= 6) {
3953 /* We've seen at least one base-64 character */
3954 errmsg = "partial character in shift sequence";
3955 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003956 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003957 else {
3958 /* Some bits remain; they should be zero */
3959 if (base64buffer != 0) {
3960 errmsg = "non-zero padding bits in shift sequence";
3961 goto utf7Error;
3962 }
3963 }
3964 }
3965 if (ch != '-') {
3966 /* '-' is absorbed; other terminating
3967 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003968 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3969 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003970 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003971 }
3972 }
3973 else if ( ch == '+' ) {
/* Remember where the '+' was: used as the error start position and as
   the value stored in *consumed if the input ends inside this section. */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003974 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003975 s++; /* consume '+' */
3976 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003977 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003978 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3979 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003980 }
3981 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003982 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003983 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003984 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003985 }
3986 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003987 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003988 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3989 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003990 s++;
3991 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003992 else {
3993 startinpos = s-starts;
3994 s++;
3995 errmsg = "unexpected special character";
3996 goto utf7Error;
3997 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003998 continue;
/* Error path inside the loop: report [startinpos, endinpos) with errmsg.
   The handler receives pointers to starts, e, s, unicode and outpos and
   may update them; decoding then continues with the next iteration. */
Antoine Pitrou244651a2009-05-04 18:56:13 +00003999utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004000 endinpos = s-starts;
4001 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004002 errors, &errorHandler,
4003 "utf7", errmsg,
4004 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004005 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004006 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004007 }
4008
Antoine Pitrou244651a2009-05-04 18:56:13 +00004009 /* end of string */
4010
4011 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4012 /* if we're in an inconsistent state, that's an error */
4013 if (surrogate ||
4014 (base64bits >= 6) ||
4015 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004016 endinpos = size;
4017 if (unicode_decode_call_errorhandler(
4018 errors, &errorHandler,
4019 "utf7", "unterminated shift sequence",
4020 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004021 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004022 goto onError;
/* NOTE(review): inShift, surrogate and the base64 accumulator are not
   reset before jumping back into the loop -- confirm this is intended. */
4023 if (s < e)
4024 goto restart;
4025 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004026 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004027
4028 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004029 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004030 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004031 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004032 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004033 }
4034 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004035 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004036 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004037 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004038
/* Trim the output to the number of characters actually written. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004039 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004040 goto onError;
4041
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004042 Py_XDECREF(errorHandler);
4043 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004044 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004045
Benjamin Peterson29060642009-01-31 22:14:21 +00004046 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 Py_XDECREF(errorHandler);
4048 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004049 Py_DECREF(unicode);
4050 return NULL;
4051}
4052
4053
Alexander Belopolsky40018472011-02-26 01:02:56 +00004054PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004055_PyUnicode_EncodeUTF7(PyObject *str,
4056 int base64SetO,
4057 int base64WhiteSpace,
4058 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004059{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004060 int kind;
4061 void *data;
4062 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004063 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004064 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004065 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004066 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004067 unsigned int base64bits = 0;
4068 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004069 char * out;
4070 char * start;
4071
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004072 if (PyUnicode_READY(str) < 0)
4073 return NULL;
4074 kind = PyUnicode_KIND(str);
4075 data = PyUnicode_DATA(str);
4076 len = PyUnicode_GET_LENGTH(str);
4077
4078 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004079 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004080
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004081 /* It might be possible to tighten this worst case */
4082 allocated = 8 * len;
4083 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004084 return PyErr_NoMemory();
4085
Antoine Pitrou244651a2009-05-04 18:56:13 +00004086 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004087 if (v == NULL)
4088 return NULL;
4089
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004090 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004091 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004092 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004093
Antoine Pitrou244651a2009-05-04 18:56:13 +00004094 if (inShift) {
4095 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4096 /* shifting out */
4097 if (base64bits) { /* output remaining bits */
4098 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4099 base64buffer = 0;
4100 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004101 }
4102 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004103 /* Characters not in the BASE64 set implicitly unshift the sequence
4104 so no '-' is required, except if the character is itself a '-' */
4105 if (IS_BASE64(ch) || ch == '-') {
4106 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004107 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004108 *out++ = (char) ch;
4109 }
4110 else {
4111 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004112 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004113 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004114 else { /* not in a shift sequence */
4115 if (ch == '+') {
4116 *out++ = '+';
4117 *out++ = '-';
4118 }
4119 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4120 *out++ = (char) ch;
4121 }
4122 else {
4123 *out++ = '+';
4124 inShift = 1;
4125 goto encode_char;
4126 }
4127 }
4128 continue;
4129encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004130 if (ch >= 0x10000) {
4131 /* code first surrogate */
4132 base64bits += 16;
4133 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4134 while (base64bits >= 6) {
4135 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4136 base64bits -= 6;
4137 }
4138 /* prepare second surrogate */
4139 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4140 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004141 base64bits += 16;
4142 base64buffer = (base64buffer << 16) | ch;
4143 while (base64bits >= 6) {
4144 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4145 base64bits -= 6;
4146 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004147 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004148 if (base64bits)
4149 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4150 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004151 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004152 if (_PyBytes_Resize(&v, out - start) < 0)
4153 return NULL;
4154 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004155}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004156PyObject *
4157PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4158 Py_ssize_t size,
4159 int base64SetO,
4160 int base64WhiteSpace,
4161 const char *errors)
4162{
4163 PyObject *result;
4164 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4165 if (tmp == NULL)
4166 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004167 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004168 base64WhiteSpace, errors);
4169 Py_DECREF(tmp);
4170 return result;
4171}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004172
Antoine Pitrou244651a2009-05-04 18:56:13 +00004173#undef IS_BASE64
4174#undef FROM_BASE64
4175#undef TO_BASE64
4176#undef DECODE_DIRECT
4177#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004178
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179/* --- UTF-8 Codec -------------------------------------------------------- */
4180
/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: never valid as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-C1 illegal, C2-DF: two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    /* E0-EF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    /* F0-F4: four-byte sequences, F5-FF illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4202
Alexander Belopolsky40018472011-02-26 01:02:56 +00004203PyObject *
4204PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004205 Py_ssize_t size,
4206 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004207{
Walter Dörwald69652032004-09-07 20:24:22 +00004208 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4209}
4210
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004211#include "stringlib/ucs1lib.h"
4212#include "stringlib/codecs.h"
4213#include "stringlib/undef.h"
4214
4215#include "stringlib/ucs2lib.h"
4216#include "stringlib/codecs.h"
4217#include "stringlib/undef.h"
4218
4219#include "stringlib/ucs4lib.h"
4220#include "stringlib/codecs.h"
4221#include "stringlib/undef.h"
4222
Antoine Pitrouab868312009-01-10 15:40:25 +00004223/* Mask to check or force alignment of a pointer to C 'long' boundaries */
/* NOTE(review): the expansion is a cast expression without enclosing
   parentheses.  That is fine for the `x & LONG_PTR_MASK` and
   `~LONG_PTR_MASK` uses in utf8_max_char_size_and_char_count; take care
   before using it next to postfix operators or sizeof. */
4224#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4225
4226/* Mask to quickly check whether a C 'long' contains a
4227 non-ASCII, UTF8-encoded char. */
/* The mask has the top bit of every byte set, so `value & ASCII_CHAR_MASK`
   is non-zero exactly when at least one byte of the word is >= 0x80. */
4228#if (SIZEOF_LONG == 8)
4229# define ASCII_CHAR_MASK 0x8080808080808080L
4230#elif (SIZEOF_LONG == 4)
4231# define ASCII_CHAR_MASK 0x80808080L
4232#else
4233# error C 'long' size should be either 4 or 8!
4234#endif
4235
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004236/* Scans a UTF-8 string and returns the maximum character to be expected
4237 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004239 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004240 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004241 */
/* s, string_size: the bytes to scan.
   unicode_size:   must be non-NULL; receives the number of bytes that are
                   not continuation bytes (10xxxxxx), i.e. one per encoded
                   character.
   Return value:   an upper bound for the largest character, one of
                   127, 255, 65535 or 65537 (the last meaning "above the
                   16-bit range"), chosen from the largest byte value seen:
                   all bytes < 0x80, then < 0xc4, then < 0xf0. */
4242static Py_UCS4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004243utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
4244 Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004245{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004246 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004247 const unsigned char *p = (const unsigned char *)s;
4248 const unsigned char *end = p + string_size;
/* end rounded down to a multiple of SIZEOF_LONG: limit for whole-word reads */
4249 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004250
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004251 assert(unicode_size != NULL);
4252
4253 /* By having a cascade of independent loops which fallback onto each
4254 other, we minimize the amount of work done in the average loop
4255 iteration, and we also maximize the CPU's ability to predict
4256 branches correctly (because a given condition will have always the
4257 same boolean outcome except perhaps in the last iteration of the
4258 corresponding loop).
4259 In the general case this brings us rather close to decoding
4260 performance pre-PEP 393, despite the two-pass decoding.
4261
4262 Note that the pure ASCII loop is not duplicated once a non-ASCII
4263 character has been encountered. It is actually a pessimization (by
4264 a significant factor) to use this loop on text with many non-ASCII
4265 characters, and it is important to avoid bad performance on valid
4266 utf-8 data (invalid utf-8 being a different can of worms).
4267 */
4268
4269 /* ASCII */
4270 for (; p < end; ++p) {
/* Note: the branch below is taken when the current byte IS ASCII; it then
   tries to skip whole words of ASCII bytes at once. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004271 /* Only check value if it's not a ASCII char... */
4272 if (*p < 0x80) {
4273 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4274 an explanation. */
4275 if (!((size_t) p & LONG_PTR_MASK)) {
4276 /* Help register allocation */
4277 register const unsigned char *_p = p;
4278 while (_p < aligned_end) {
/* Read one long-sized word; stop at the first word containing a byte
   with its high bit set. */
4279 unsigned long value = *(unsigned long *) _p;
4280 if (value & ASCII_CHAR_MASK)
4281 break;
4282 _p += SIZEOF_LONG;
4283 char_count += SIZEOF_LONG;
4284 }
4285 p = _p;
4286 if (p == end)
4287 break;
4288 }
4289 }
/* Byte-at-a-time step: p may have been advanced by the fast path above,
   so the current byte is tested again. */
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004290 if (*p < 0x80)
4291 ++char_count;
4292 else
4293 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004294 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004295 *unicode_size = char_count;
4296 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004297
/* From here on every byte is inspected individually.  The expression
   (*p & 0xc0) != 0x80 is 1 for every byte that is not a continuation
   byte, so char_count grows by one per sequence start. */
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004298_ucs1loop:
4299 for (; p < end; ++p) {
4300 if (*p < 0xc4)
4301 char_count += ((*p & 0xc0) != 0x80);
4302 else
4303 goto _ucs2loop;
4304 }
4305 *unicode_size = char_count;
4306 return 255;
4307
4308_ucs2loop:
4309 for (; p < end; ++p) {
4310 if (*p < 0xf0)
4311 char_count += ((*p & 0xc0) != 0x80);
4312 else
4313 goto _ucs4loop;
4314 }
4315 *unicode_size = char_count;
4316 return 65535;
4317
4318_ucs4loop:
4319 for (; p < end; ++p) {
4320 char_count += ((*p & 0xc0) != 0x80);
4321 }
4322 *unicode_size = char_count;
4323 return 65537;
4324}
4325
4326/* Called when we encountered some error that wasn't detected in the original
4327 scan, e.g. an encoded surrogate character. The original maxchar computation
4328 may have been incorrect, so redo it. */
4329static int
4330refit_partial_string(PyObject **unicode, int kind, void *data, Py_ssize_t n)
4331{
4332 PyObject *tmp;
Victor Stinnerf8facac2011-11-22 02:30:47 +01004333 Py_ssize_t k;
4334 Py_UCS4 maxchar;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004335 for (k = 0, maxchar = 0; k < n; k++)
4336 maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
4337 tmp = PyUnicode_New(PyUnicode_GET_LENGTH(*unicode), maxchar);
4338 if (tmp == NULL)
4339 return -1;
4340 PyUnicode_CopyCharacters(tmp, 0, *unicode, 0, n);
4341 Py_DECREF(*unicode);
4342 *unicode = tmp;
4343 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004344}
4345
/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
   in case of errors.

   Implicit parameters taken from the expansion site: unicode, kind, data,
   has_errors, and the onError label.

   While has_errors is zero this is a plain PyUnicode_WRITE into the
   (kind, data) buffer.  Once has_errors is set, the string is grown with
   unicode_resize() when the position lies past its current length and the
   character is stored with unicode_putchar(); a failure of either call
   jumps to onError.  Resizing overallocates (pos + pos/8), so the result
   needs to shrink at the end.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        if (!has_errors) {                                              \
            PyUnicode_WRITE(kind, data, index, value);                  \
        }                                                               \
        else {                                                          \
            Py_ssize_t pos = index;                                     \
            if (pos > PyUnicode_GET_LENGTH(unicode)) {                  \
                if (unicode_resize(&unicode, pos + pos/8) < 0)          \
                    goto onError;                                       \
            }                                                           \
            if (unicode_putchar(&unicode, &pos, value) < 0)             \
                goto onError;                                           \
        }                                                               \
    } while (0)
4364
Alexander Belopolsky40018472011-02-26 01:02:56 +00004365PyObject *
4366PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004367 Py_ssize_t size,
4368 const char *errors,
4369 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004370{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004371 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004372 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004373 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004374 Py_ssize_t startinpos;
4375 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004376 const char *e, *aligned_end;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004377 PyObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004378 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004379 PyObject *errorHandler = NULL;
4380 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004381 Py_UCS4 maxchar = 0;
4382 Py_ssize_t unicode_size;
4383 Py_ssize_t i;
4384 int kind;
4385 void *data;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004386 int has_errors = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387
Walter Dörwald69652032004-09-07 20:24:22 +00004388 if (size == 0) {
4389 if (consumed)
4390 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004391 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004392 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004393 maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004394 /* When the string is ASCII only, just use memcpy and return.
4395 unicode_size may be != size if there is an incomplete UTF-8
4396 sequence at the end of the ASCII block. */
4397 if (maxchar < 128 && size == unicode_size) {
Victor Stinner42885202011-11-22 01:23:02 +01004398 if (consumed)
4399 *consumed = size;
4400
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004401 if (size == 1)
4402 return get_latin1_char((unsigned char)s[0]);
4403
4404 unicode = PyUnicode_New(unicode_size, maxchar);
4405 if (!unicode)
4406 return NULL;
4407 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4408 assert(_PyUnicode_CheckConsistency(unicode, 1));
4409 return unicode;
4410 }
4411
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004412 /* In case of errors, maxchar and size computation might be incorrect;
4413 code below refits and resizes as necessary. */
4414 unicode = PyUnicode_New(unicode_size, maxchar);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004415 if (!unicode)
4416 return NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004417 kind = PyUnicode_KIND(unicode);
4418 data = PyUnicode_DATA(unicode);
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004419
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004421 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422 e = s + size;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004423 switch (kind) {
4424 case PyUnicode_1BYTE_KIND:
4425 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4426 break;
4427 case PyUnicode_2BYTE_KIND:
4428 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4429 break;
4430 case PyUnicode_4BYTE_KIND:
4431 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4432 break;
4433 }
4434 if (!has_errors) {
4435 /* Ensure the unicode size calculation was correct */
4436 assert(i == unicode_size);
4437 assert(s == e);
4438 if (consumed)
4439 *consumed = s-starts;
4440 return unicode;
4441 }
4442 /* Fall through to the generic decoding loop for the rest of
4443 the string */
4444 if (refit_partial_string(&unicode, kind, data, i) < 0)
4445 goto onError;
4446
Antoine Pitrouab868312009-01-10 15:40:25 +00004447 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448
4449 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004450 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451
4452 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004453 /* Fast path for runs of ASCII characters. Given that common UTF-8
4454 input will consist of an overwhelming majority of ASCII
4455 characters, we try to optimize for this case by checking
4456 as many characters as a C 'long' can contain.
4457 First, check if we can do an aligned read, as most CPUs have
4458 a penalty for unaligned reads.
4459 */
4460 if (!((size_t) s & LONG_PTR_MASK)) {
4461 /* Help register allocation */
4462 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004463 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004464 while (_s < aligned_end) {
4465 /* Read a whole long at a time (either 4 or 8 bytes),
4466 and do a fast unrolled copy if it only contains ASCII
4467 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004468 unsigned long value = *(unsigned long *) _s;
4469 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004470 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004471 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4472 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4473 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4474 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004475#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004476 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4477 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4478 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4479 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004480#endif
4481 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004482 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004483 }
4484 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004485 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004486 if (s == e)
4487 break;
4488 ch = (unsigned char)*s;
4489 }
4490 }
4491
4492 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004493 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494 s++;
4495 continue;
4496 }
4497
4498 n = utf8_code_length[ch];
4499
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004500 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004501 if (consumed)
4502 break;
4503 else {
4504 errmsg = "unexpected end of data";
4505 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004506 endinpos = startinpos+1;
4507 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4508 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 goto utf8Error;
4510 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004511 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004512
4513 switch (n) {
4514
4515 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004516 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004517 startinpos = s-starts;
4518 endinpos = startinpos+1;
4519 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004520
4521 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004522 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004523 startinpos = s-starts;
4524 endinpos = startinpos+1;
4525 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526
4527 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004528 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004529 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004531 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004532 goto utf8Error;
4533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004535 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004536 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537 break;
4538
4539 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004540 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4541 will result in surrogates in range d800-dfff. Surrogates are
4542 not valid UTF-8 so they are rejected.
4543 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4544 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004545 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004546 (s[2] & 0xc0) != 0x80 ||
4547 ((unsigned char)s[0] == 0xE0 &&
4548 (unsigned char)s[1] < 0xA0) ||
4549 ((unsigned char)s[0] == 0xED &&
4550 (unsigned char)s[1] > 0x9F)) {
4551 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004552 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004553 endinpos = startinpos + 1;
4554
4555 /* if s[1] first two bits are 1 and 0, then the invalid
4556 continuation byte is s[2], so increment endinpos by 1,
4557 if not, s[1] is invalid and endinpos doesn't need to
4558 be incremented. */
4559 if ((s[1] & 0xC0) == 0x80)
4560 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004561 goto utf8Error;
4562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004564 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004565 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004566 break;
4567
4568 case 4:
4569 if ((s[1] & 0xc0) != 0x80 ||
4570 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004571 (s[3] & 0xc0) != 0x80 ||
4572 ((unsigned char)s[0] == 0xF0 &&
4573 (unsigned char)s[1] < 0x90) ||
4574 ((unsigned char)s[0] == 0xF4 &&
4575 (unsigned char)s[1] > 0x8F)) {
4576 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004577 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004578 endinpos = startinpos + 1;
4579 if ((s[1] & 0xC0) == 0x80) {
4580 endinpos++;
4581 if ((s[2] & 0xC0) == 0x80)
4582 endinpos++;
4583 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004584 goto utf8Error;
4585 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004586 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004587 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4588 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4589
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004590 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592 }
4593 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004594 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004595
Benjamin Peterson29060642009-01-31 22:14:21 +00004596 utf8Error:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004597 if (!has_errors) {
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004598 if (refit_partial_string(&unicode, kind, data, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004599 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004600 has_errors = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004601 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004602 if (unicode_decode_call_errorhandler(
4603 errors, &errorHandler,
4604 "utf8", errmsg,
4605 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004606 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004607 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004608 /* Update data because unicode_decode_call_errorhandler might have
4609 re-created or resized the unicode object. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004610 data = PyUnicode_DATA(unicode);
4611 kind = PyUnicode_KIND(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004612 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004613 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004614 /* Ensure the unicode_size calculation above was correct: */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004615 assert(has_errors || i == unicode_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004616
Walter Dörwald69652032004-09-07 20:24:22 +00004617 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004618 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004620 /* Adjust length and ready string when it contained errors and
4621 is of the old resizable kind. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004622 if (has_errors) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01004623 if (PyUnicode_Resize(&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004624 goto onError;
4625 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004627 Py_XDECREF(errorHandler);
4628 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004629 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004630 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004631
Benjamin Peterson29060642009-01-31 22:14:21 +00004632 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004633 Py_XDECREF(errorHandler);
4634 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635 Py_DECREF(unicode);
4636 return NULL;
4637}
4638
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004639#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004640
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004641#ifdef __APPLE__
4642
4643/* Simplified UTF-8 decoder using surrogateescape error handler,
4644 used to decode the command line arguments on Mac OS X. */
4645
4646wchar_t*
4647_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4648{
4649 int n;
4650 const char *e;
4651 wchar_t *unicode, *p;
4652
4653 /* Note: size will always be longer than the resulting Unicode
4654 character count */
4655 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4656 PyErr_NoMemory();
4657 return NULL;
4658 }
4659 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4660 if (!unicode)
4661 return NULL;
4662
4663 /* Unpack UTF-8 encoded data */
4664 p = unicode;
4665 e = s + size;
4666 while (s < e) {
4667 Py_UCS4 ch = (unsigned char)*s;
4668
4669 if (ch < 0x80) {
4670 *p++ = (wchar_t)ch;
4671 s++;
4672 continue;
4673 }
4674
4675 n = utf8_code_length[ch];
4676 if (s + n > e) {
4677 goto surrogateescape;
4678 }
4679
4680 switch (n) {
4681 case 0:
4682 case 1:
4683 goto surrogateescape;
4684
4685 case 2:
4686 if ((s[1] & 0xc0) != 0x80)
4687 goto surrogateescape;
4688 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4689 assert ((ch > 0x007F) && (ch <= 0x07FF));
4690 *p++ = (wchar_t)ch;
4691 break;
4692
4693 case 3:
4694 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4695 will result in surrogates in range d800-dfff. Surrogates are
4696 not valid UTF-8 so they are rejected.
4697 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4698 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4699 if ((s[1] & 0xc0) != 0x80 ||
4700 (s[2] & 0xc0) != 0x80 ||
4701 ((unsigned char)s[0] == 0xE0 &&
4702 (unsigned char)s[1] < 0xA0) ||
4703 ((unsigned char)s[0] == 0xED &&
4704 (unsigned char)s[1] > 0x9F)) {
4705
4706 goto surrogateescape;
4707 }
4708 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4709 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004710 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004711 break;
4712
4713 case 4:
4714 if ((s[1] & 0xc0) != 0x80 ||
4715 (s[2] & 0xc0) != 0x80 ||
4716 (s[3] & 0xc0) != 0x80 ||
4717 ((unsigned char)s[0] == 0xF0 &&
4718 (unsigned char)s[1] < 0x90) ||
4719 ((unsigned char)s[0] == 0xF4 &&
4720 (unsigned char)s[1] > 0x8F)) {
4721 goto surrogateescape;
4722 }
4723 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4724 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4725 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4726
4727#if SIZEOF_WCHAR_T == 4
4728 *p++ = (wchar_t)ch;
4729#else
4730 /* compute and append the two surrogates: */
4731
4732 /* translate from 10000..10FFFF to 0..FFFF */
4733 ch -= 0x10000;
4734
4735 /* high surrogate = top 10 bits added to D800 */
4736 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4737
4738 /* low surrogate = bottom 10 bits added to DC00 */
4739 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4740#endif
4741 break;
4742 }
4743 s += n;
4744 continue;
4745
4746 surrogateescape:
4747 *p++ = 0xDC00 + ch;
4748 s++;
4749 }
4750 *p = L'\0';
4751 return unicode;
4752}
4753
4754#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004756/* Primary internal function which creates utf8 encoded bytes objects.
4757
4758 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004759 and allocate exactly as much space needed at the end. Else allocate the
4760 maximum possible needed (4 result bytes per Unicode character), and return
4761 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004762*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004763PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004764_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765{
Tim Peters602f7402002-04-27 18:03:26 +00004766#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004767
Guido van Rossum98297ee2007-11-06 21:34:58 +00004768 Py_ssize_t i; /* index into s of next input byte */
4769 PyObject *result; /* result string object */
4770 char *p; /* next free byte in output buffer */
4771 Py_ssize_t nallocated; /* number of result bytes allocated */
4772 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004773 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004774 PyObject *errorHandler = NULL;
4775 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004776 int kind;
4777 void *data;
4778 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004779 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004781 if (!PyUnicode_Check(unicode)) {
4782 PyErr_BadArgument();
4783 return NULL;
4784 }
4785
4786 if (PyUnicode_READY(unicode) == -1)
4787 return NULL;
4788
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004789 if (PyUnicode_UTF8(unicode))
4790 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4791 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004792
4793 kind = PyUnicode_KIND(unicode);
4794 data = PyUnicode_DATA(unicode);
4795 size = PyUnicode_GET_LENGTH(unicode);
4796
Tim Peters602f7402002-04-27 18:03:26 +00004797 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798
Tim Peters602f7402002-04-27 18:03:26 +00004799 if (size <= MAX_SHORT_UNICHARS) {
4800 /* Write into the stack buffer; nallocated can't overflow.
4801 * At the end, we'll allocate exactly as much heap space as it
4802 * turns out we need.
4803 */
4804 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004805 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004806 p = stackbuf;
4807 }
4808 else {
4809 /* Overallocate on the heap, and give the excess back at the end. */
4810 nallocated = size * 4;
4811 if (nallocated / 4 != size) /* overflow! */
4812 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004813 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004814 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004815 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004816 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004817 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004818
Tim Peters602f7402002-04-27 18:03:26 +00004819 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004820 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004821
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004822 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004823 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004825
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004827 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004828 *p++ = (char)(0xc0 | (ch >> 6));
4829 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004830 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004831 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004832 Py_ssize_t repsize, k, startpos;
4833 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004834 rep = unicode_encode_call_errorhandler(
4835 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004836 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004837 if (!rep)
4838 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004840 if (PyBytes_Check(rep))
4841 repsize = PyBytes_GET_SIZE(rep);
4842 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004843 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004844
4845 if (repsize > 4) {
4846 Py_ssize_t offset;
4847
4848 if (result == NULL)
4849 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004850 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004851 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004852
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004853 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4854 /* integer overflow */
4855 PyErr_NoMemory();
4856 goto error;
4857 }
4858 nallocated += repsize - 4;
4859 if (result != NULL) {
4860 if (_PyBytes_Resize(&result, nallocated) < 0)
4861 goto error;
4862 } else {
4863 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004864 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004865 goto error;
4866 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4867 }
4868 p = PyBytes_AS_STRING(result) + offset;
4869 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004871 if (PyBytes_Check(rep)) {
4872 char *prep = PyBytes_AS_STRING(rep);
4873 for(k = repsize; k > 0; k--)
4874 *p++ = *prep++;
4875 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004876 enum PyUnicode_Kind repkind;
4877 void *repdata;
4878
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004879 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004880 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004881 repkind = PyUnicode_KIND(rep);
4882 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004883
4884 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004885 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004886 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004887 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004888 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004889 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004890 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004891 goto error;
4892 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004893 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004894 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004895 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004896 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004897 } else if (ch < 0x10000) {
4898 *p++ = (char)(0xe0 | (ch >> 12));
4899 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4900 *p++ = (char)(0x80 | (ch & 0x3f));
4901 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004902 /* Encode UCS4 Unicode ordinals */
4903 *p++ = (char)(0xf0 | (ch >> 18));
4904 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4905 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4906 *p++ = (char)(0x80 | (ch & 0x3f));
4907 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004909
Guido van Rossum98297ee2007-11-06 21:34:58 +00004910 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004911 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004912 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004913 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004914 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004915 }
4916 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004917 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004918 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004919 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004920 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004921 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004922
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004923 Py_XDECREF(errorHandler);
4924 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004925 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004926 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004927 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004928 Py_XDECREF(errorHandler);
4929 Py_XDECREF(exc);
4930 Py_XDECREF(result);
4931 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004932
Tim Peters602f7402002-04-27 18:03:26 +00004933#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934}
4935
Alexander Belopolsky40018472011-02-26 01:02:56 +00004936PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004937PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4938 Py_ssize_t size,
4939 const char *errors)
4940{
4941 PyObject *v, *unicode;
4942
4943 unicode = PyUnicode_FromUnicode(s, size);
4944 if (unicode == NULL)
4945 return NULL;
4946 v = _PyUnicode_AsUTF8String(unicode, errors);
4947 Py_DECREF(unicode);
4948 return v;
4949}
4950
4951PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004952PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004953{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004954 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955}
4956
Walter Dörwald41980ca2007-08-16 21:55:45 +00004957/* --- UTF-32 Codec ------------------------------------------------------- */
4958
4959PyObject *
4960PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004961 Py_ssize_t size,
4962 const char *errors,
4963 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004964{
4965 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4966}
4967
4968PyObject *
4969PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004970 Py_ssize_t size,
4971 const char *errors,
4972 int *byteorder,
4973 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004974{
4975 const char *starts = s;
4976 Py_ssize_t startinpos;
4977 Py_ssize_t endinpos;
4978 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004979 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004980 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004981 int bo = 0; /* assume native ordering by default */
4982 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004983 /* Offsets from q for retrieving bytes in the right order. */
4984#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4985 int iorder[] = {0, 1, 2, 3};
4986#else
4987 int iorder[] = {3, 2, 1, 0};
4988#endif
4989 PyObject *errorHandler = NULL;
4990 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004991
Walter Dörwald41980ca2007-08-16 21:55:45 +00004992 q = (unsigned char *)s;
4993 e = q + size;
4994
4995 if (byteorder)
4996 bo = *byteorder;
4997
4998 /* Check for BOM marks (U+FEFF) in the input and adjust current
4999 byte order setting accordingly. In native mode, the leading BOM
5000 mark is skipped, in all other modes, it is copied to the output
5001 stream as-is (giving a ZWNBSP character). */
5002 if (bo == 0) {
5003 if (size >= 4) {
5004 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005005 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005006#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005007 if (bom == 0x0000FEFF) {
5008 q += 4;
5009 bo = -1;
5010 }
5011 else if (bom == 0xFFFE0000) {
5012 q += 4;
5013 bo = 1;
5014 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005015#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005016 if (bom == 0x0000FEFF) {
5017 q += 4;
5018 bo = 1;
5019 }
5020 else if (bom == 0xFFFE0000) {
5021 q += 4;
5022 bo = -1;
5023 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005024#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005026 }
5027
5028 if (bo == -1) {
5029 /* force LE */
5030 iorder[0] = 0;
5031 iorder[1] = 1;
5032 iorder[2] = 2;
5033 iorder[3] = 3;
5034 }
5035 else if (bo == 1) {
5036 /* force BE */
5037 iorder[0] = 3;
5038 iorder[1] = 2;
5039 iorder[2] = 1;
5040 iorder[3] = 0;
5041 }
5042
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005043 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005044 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005045 if (!unicode)
5046 return NULL;
5047 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005048 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005049 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005050
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 Py_UCS4 ch;
5053 /* remaining bytes at the end? (size should be divisible by 4) */
5054 if (e-q<4) {
5055 if (consumed)
5056 break;
5057 errmsg = "truncated data";
5058 startinpos = ((const char *)q)-starts;
5059 endinpos = ((const char *)e)-starts;
5060 goto utf32Error;
5061 /* The remaining input chars are ignored if the callback
5062 chooses to skip the input */
5063 }
5064 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5065 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005066
Benjamin Peterson29060642009-01-31 22:14:21 +00005067 if (ch >= 0x110000)
5068 {
5069 errmsg = "codepoint not in range(0x110000)";
5070 startinpos = ((const char *)q)-starts;
5071 endinpos = startinpos+4;
5072 goto utf32Error;
5073 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005074 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5075 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 q += 4;
5077 continue;
5078 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005079 if (unicode_decode_call_errorhandler(
5080 errors, &errorHandler,
5081 "utf32", errmsg,
5082 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005083 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005084 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005085 }
5086
5087 if (byteorder)
5088 *byteorder = bo;
5089
5090 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005091 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005092
5093 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005094 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005095 goto onError;
5096
5097 Py_XDECREF(errorHandler);
5098 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005099 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005100
Benjamin Peterson29060642009-01-31 22:14:21 +00005101 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005102 Py_DECREF(unicode);
5103 Py_XDECREF(errorHandler);
5104 Py_XDECREF(exc);
5105 return NULL;
5106}
5107
5108PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005109_PyUnicode_EncodeUTF32(PyObject *str,
5110 const char *errors,
5111 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005112{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005113 int kind;
5114 void *data;
5115 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005116 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005117 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005118 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005119 /* Offsets from p for storing byte pairs in the right order. */
5120#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5121 int iorder[] = {0, 1, 2, 3};
5122#else
5123 int iorder[] = {3, 2, 1, 0};
5124#endif
5125
Benjamin Peterson29060642009-01-31 22:14:21 +00005126#define STORECHAR(CH) \
5127 do { \
5128 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5129 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5130 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5131 p[iorder[0]] = (CH) & 0xff; \
5132 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005133 } while(0)
5134
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005135 if (!PyUnicode_Check(str)) {
5136 PyErr_BadArgument();
5137 return NULL;
5138 }
5139 if (PyUnicode_READY(str) < 0)
5140 return NULL;
5141 kind = PyUnicode_KIND(str);
5142 data = PyUnicode_DATA(str);
5143 len = PyUnicode_GET_LENGTH(str);
5144
5145 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005146 bytesize = nsize * 4;
5147 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005148 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005149 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005150 if (v == NULL)
5151 return NULL;
5152
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005153 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005154 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005155 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005156 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005157 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005158
5159 if (byteorder == -1) {
5160 /* force LE */
5161 iorder[0] = 0;
5162 iorder[1] = 1;
5163 iorder[2] = 2;
5164 iorder[3] = 3;
5165 }
5166 else if (byteorder == 1) {
5167 /* force BE */
5168 iorder[0] = 3;
5169 iorder[1] = 2;
5170 iorder[2] = 1;
5171 iorder[3] = 0;
5172 }
5173
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005174 for (i = 0; i < len; i++)
5175 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005176
5177 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005178 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005179#undef STORECHAR
5180}
5181
Alexander Belopolsky40018472011-02-26 01:02:56 +00005182PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005183PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5184 Py_ssize_t size,
5185 const char *errors,
5186 int byteorder)
5187{
5188 PyObject *result;
5189 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5190 if (tmp == NULL)
5191 return NULL;
5192 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5193 Py_DECREF(tmp);
5194 return result;
5195}
5196
5197PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005198PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005199{
Victor Stinnerb960b342011-11-20 19:12:52 +01005200 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005201}
5202
/* --- UTF-16 Codec ------------------------------------------------------- */
5204
Tim Peters772747b2001-08-09 22:21:55 +00005205PyObject *
5206PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005207 Py_ssize_t size,
5208 const char *errors,
5209 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210{
Walter Dörwald69652032004-09-07 20:24:22 +00005211 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5212}
5213
Antoine Pitrouab868312009-01-10 15:40:25 +00005214/* Two masks for fast checking of whether a C 'long' may contain
5215 UTF16-encoded surrogate characters. This is an efficient heuristic,
5216 assuming that non-surrogate characters with a code point >= 0x8000 are
5217 rare in most input.
5218 FAST_CHAR_MASK is used when the input is in native byte ordering,
5219 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005220*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005221#if (SIZEOF_LONG == 8)
5222# define FAST_CHAR_MASK 0x8000800080008000L
5223# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5224#elif (SIZEOF_LONG == 4)
5225# define FAST_CHAR_MASK 0x80008000L
5226# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5227#else
5228# error C 'long' size should be either 4 or 8!
5229#endif
5230
Walter Dörwald69652032004-09-07 20:24:22 +00005231PyObject *
5232PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005233 Py_ssize_t size,
5234 const char *errors,
5235 int *byteorder,
5236 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005237{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005238 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005239 Py_ssize_t startinpos;
5240 Py_ssize_t endinpos;
5241 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005242 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005243 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005244 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005245 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005246 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005247 /* Offsets from q for retrieving byte pairs in the right order. */
5248#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5249 int ihi = 1, ilo = 0;
5250#else
5251 int ihi = 0, ilo = 1;
5252#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005253 PyObject *errorHandler = NULL;
5254 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255
5256 /* Note: size will always be longer than the resulting Unicode
5257 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005258 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259 if (!unicode)
5260 return NULL;
5261 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005262 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005263 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264
Tim Peters772747b2001-08-09 22:21:55 +00005265 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005266 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267
5268 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005269 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005271 /* Check for BOM marks (U+FEFF) in the input and adjust current
5272 byte order setting accordingly. In native mode, the leading BOM
5273 mark is skipped, in all other modes, it is copied to the output
5274 stream as-is (giving a ZWNBSP character). */
5275 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005276 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005277 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005278#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005279 if (bom == 0xFEFF) {
5280 q += 2;
5281 bo = -1;
5282 }
5283 else if (bom == 0xFFFE) {
5284 q += 2;
5285 bo = 1;
5286 }
Tim Petersced69f82003-09-16 20:30:58 +00005287#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005288 if (bom == 0xFEFF) {
5289 q += 2;
5290 bo = 1;
5291 }
5292 else if (bom == 0xFFFE) {
5293 q += 2;
5294 bo = -1;
5295 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005296#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005297 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005298 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299
Tim Peters772747b2001-08-09 22:21:55 +00005300 if (bo == -1) {
5301 /* force LE */
5302 ihi = 1;
5303 ilo = 0;
5304 }
5305 else if (bo == 1) {
5306 /* force BE */
5307 ihi = 0;
5308 ilo = 1;
5309 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005310#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5311 native_ordering = ilo < ihi;
5312#else
5313 native_ordering = ilo > ihi;
5314#endif
Tim Peters772747b2001-08-09 22:21:55 +00005315
Antoine Pitrouab868312009-01-10 15:40:25 +00005316 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005317 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005318 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005319 /* First check for possible aligned read of a C 'long'. Unaligned
5320 reads are more expensive, better to defer to another iteration. */
5321 if (!((size_t) q & LONG_PTR_MASK)) {
5322 /* Fast path for runs of non-surrogate chars. */
5323 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005324 int kind = PyUnicode_KIND(unicode);
5325 void *data = PyUnicode_DATA(unicode);
5326 while (_q < aligned_end) {
5327 unsigned long block = * (unsigned long *) _q;
5328 unsigned short *pblock = (unsigned short*)&block;
5329 Py_UCS4 maxch;
5330 if (native_ordering) {
5331 /* Can use buffer directly */
5332 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005333 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005334 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005335 else {
5336 /* Need to byte-swap */
5337 unsigned char *_p = (unsigned char*)pblock;
5338 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005339 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005340 _p[0] = _q[1];
5341 _p[1] = _q[0];
5342 _p[2] = _q[3];
5343 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005344#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005345 _p[4] = _q[5];
5346 _p[5] = _q[4];
5347 _p[6] = _q[7];
5348 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005349#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005350 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005351 maxch = Py_MAX(pblock[0], pblock[1]);
5352#if SIZEOF_LONG == 8
5353 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5354#endif
5355 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5356 if (unicode_widen(&unicode, maxch) < 0)
5357 goto onError;
5358 kind = PyUnicode_KIND(unicode);
5359 data = PyUnicode_DATA(unicode);
5360 }
5361 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5362 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5363#if SIZEOF_LONG == 8
5364 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5365 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5366#endif
5367 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005368 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005369 q = _q;
5370 if (q >= e)
5371 break;
5372 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005374
Benjamin Peterson14339b62009-01-31 16:36:08 +00005375 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005376
5377 if (ch < 0xD800 || ch > 0xDFFF) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005378 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5379 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005380 continue;
5381 }
5382
5383 /* UTF-16 code pair: */
5384 if (q > e) {
5385 errmsg = "unexpected end of data";
5386 startinpos = (((const char *)q) - 2) - starts;
5387 endinpos = ((const char *)e) + 1 - starts;
5388 goto utf16Error;
5389 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005390 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5391 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005393 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005394 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005395 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005396 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005397 continue;
5398 }
5399 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005400 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005401 startinpos = (((const char *)q)-4)-starts;
5402 endinpos = startinpos+2;
5403 goto utf16Error;
5404 }
5405
Benjamin Peterson14339b62009-01-31 16:36:08 +00005406 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 errmsg = "illegal encoding";
5408 startinpos = (((const char *)q)-2)-starts;
5409 endinpos = startinpos+2;
5410 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005411
Benjamin Peterson29060642009-01-31 22:14:21 +00005412 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005414 errors,
5415 &errorHandler,
5416 "utf16", errmsg,
5417 &starts,
5418 (const char **)&e,
5419 &startinpos,
5420 &endinpos,
5421 &exc,
5422 (const char **)&q,
5423 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005424 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005425 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005427 /* remaining byte at the end? (size should be even) */
5428 if (e == q) {
5429 if (!consumed) {
5430 errmsg = "truncated data";
5431 startinpos = ((const char *)q) - starts;
5432 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005433 if (unicode_decode_call_errorhandler(
5434 errors,
5435 &errorHandler,
5436 "utf16", errmsg,
5437 &starts,
5438 (const char **)&e,
5439 &startinpos,
5440 &endinpos,
5441 &exc,
5442 (const char **)&q,
5443 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005444 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005445 goto onError;
5446 /* The remaining input chars are ignored if the callback
5447 chooses to skip the input */
5448 }
5449 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450
5451 if (byteorder)
5452 *byteorder = bo;
5453
Walter Dörwald69652032004-09-07 20:24:22 +00005454 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005455 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005456
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005458 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 goto onError;
5460
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005461 Py_XDECREF(errorHandler);
5462 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005463 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464
Benjamin Peterson29060642009-01-31 22:14:21 +00005465 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005467 Py_XDECREF(errorHandler);
5468 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469 return NULL;
5470}
5471
Antoine Pitrouab868312009-01-10 15:40:25 +00005472#undef FAST_CHAR_MASK
5473#undef SWAPPED_FAST_CHAR_MASK
5474
Tim Peters772747b2001-08-09 22:21:55 +00005475PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005476_PyUnicode_EncodeUTF16(PyObject *str,
5477 const char *errors,
5478 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005480 int kind;
5481 void *data;
5482 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005483 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005484 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005485 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005486 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005487 /* Offsets from p for storing byte pairs in the right order. */
5488#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5489 int ihi = 1, ilo = 0;
5490#else
5491 int ihi = 0, ilo = 1;
5492#endif
5493
Benjamin Peterson29060642009-01-31 22:14:21 +00005494#define STORECHAR(CH) \
5495 do { \
5496 p[ihi] = ((CH) >> 8) & 0xff; \
5497 p[ilo] = (CH) & 0xff; \
5498 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005499 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005501 if (!PyUnicode_Check(str)) {
5502 PyErr_BadArgument();
5503 return NULL;
5504 }
5505 if (PyUnicode_READY(str) < 0)
5506 return NULL;
5507 kind = PyUnicode_KIND(str);
5508 data = PyUnicode_DATA(str);
5509 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005510
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005511 pairs = 0;
5512 if (kind == PyUnicode_4BYTE_KIND)
5513 for (i = 0; i < len; i++)
5514 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5515 pairs++;
5516 /* 2 * (len + pairs + (byteorder == 0)) */
5517 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005518 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005519 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005520 bytesize = nsize * 2;
5521 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005522 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005523 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524 if (v == NULL)
5525 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005526
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005527 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005530 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005531 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005532
5533 if (byteorder == -1) {
5534 /* force LE */
5535 ihi = 1;
5536 ilo = 0;
5537 }
5538 else if (byteorder == 1) {
5539 /* force BE */
5540 ihi = 0;
5541 ilo = 1;
5542 }
5543
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005544 for (i = 0; i < len; i++) {
5545 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5546 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005547 if (ch >= 0x10000) {
5548 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5549 ch = 0xD800 | ((ch-0x10000) >> 10);
5550 }
Tim Peters772747b2001-08-09 22:21:55 +00005551 STORECHAR(ch);
5552 if (ch2)
5553 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005554 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005555
5556 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005557 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005558#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559}
5560
Alexander Belopolsky40018472011-02-26 01:02:56 +00005561PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005562PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5563 Py_ssize_t size,
5564 const char *errors,
5565 int byteorder)
5566{
5567 PyObject *result;
5568 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5569 if (tmp == NULL)
5570 return NULL;
5571 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5572 Py_DECREF(tmp);
5573 return result;
5574}
5575
5576PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005577PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005579 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580}
5581
/* --- Unicode Escape Codec ----------------------------------------------- */
5583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005584/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5585 if all the escapes in the string make it still a valid ASCII string.
5586 Returns -1 if any escapes were found which cause the string to
5587 pop out of ASCII range. Otherwise returns the length of the
5588 required buffer to hold the string.
5589 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005590static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005591length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5592{
5593 const unsigned char *p = (const unsigned char *)s;
5594 const unsigned char *end = p + size;
5595 Py_ssize_t length = 0;
5596
5597 if (size < 0)
5598 return -1;
5599
5600 for (; p < end; ++p) {
5601 if (*p > 127) {
5602 /* Non-ASCII */
5603 return -1;
5604 }
5605 else if (*p != '\\') {
5606 /* Normal character */
5607 ++length;
5608 }
5609 else {
5610 /* Backslash-escape, check next char */
5611 ++p;
5612 /* Escape sequence reaches till end of string or
5613 non-ASCII follow-up. */
5614 if (p >= end || *p > 127)
5615 return -1;
5616 switch (*p) {
5617 case '\n':
5618 /* backslash + \n result in zero characters */
5619 break;
5620 case '\\': case '\'': case '\"':
5621 case 'b': case 'f': case 't':
5622 case 'n': case 'r': case 'v': case 'a':
5623 ++length;
5624 break;
5625 case '0': case '1': case '2': case '3':
5626 case '4': case '5': case '6': case '7':
5627 case 'x': case 'u': case 'U': case 'N':
5628 /* these do not guarantee ASCII characters */
5629 return -1;
5630 default:
5631 /* count the backslash + the other character */
5632 length += 2;
5633 }
5634 }
5635 }
5636 return length;
5637}
5638
Fredrik Lundh06d12682001-01-24 07:59:11 +00005639static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005640
Alexander Belopolsky40018472011-02-26 01:02:56 +00005641PyObject *
5642PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005643 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005644 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005646 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005647 Py_ssize_t startinpos;
5648 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005649 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005650 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005652 char* message;
5653 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005654 PyObject *errorHandler = NULL;
5655 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005656 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005657 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005658
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005659 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005660
5661 /* After length_of_escaped_ascii_string() there are two alternatives,
5662 either the string is pure ASCII with named escapes like \n, etc.
5663 and we determined it's exact size (common case)
5664 or it contains \x, \u, ... escape sequences. then we create a
5665 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005666 if (len >= 0) {
5667 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005668 if (!v)
5669 goto onError;
5670 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005671 }
5672 else {
5673 /* Escaped strings will always be longer than the resulting
5674 Unicode string, so we start with size here and then reduce the
5675 length after conversion to the true value.
5676 (but if the error callback returns a long replacement string
5677 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005678 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005679 if (!v)
5680 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005681 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005682 }
5683
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005685 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005686 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005688
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 while (s < end) {
5690 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005691 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005692 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005694 /* The only case in which i == ascii_length is a backslash
5695 followed by a newline. */
5696 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005697
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 /* Non-escape characters are interpreted as Unicode ordinals */
5699 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005700 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5701 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 continue;
5703 }
5704
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005705 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 /* \ - Escapes */
5707 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005708 c = *s++;
5709 if (s > end)
5710 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005711
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005712 /* The only case in which i == ascii_length is a backslash
5713 followed by a newline. */
5714 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005715
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005716 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717
Benjamin Peterson29060642009-01-31 22:14:21 +00005718 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005719#define WRITECHAR(ch) \
5720 do { \
5721 if (unicode_putchar(&v, &i, ch) < 0) \
5722 goto onError; \
5723 }while(0)
5724
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005726 case '\\': WRITECHAR('\\'); break;
5727 case '\'': WRITECHAR('\''); break;
5728 case '\"': WRITECHAR('\"'); break;
5729 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005730 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005731 case 'f': WRITECHAR('\014'); break;
5732 case 't': WRITECHAR('\t'); break;
5733 case 'n': WRITECHAR('\n'); break;
5734 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005735 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005736 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005737 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005738 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 case '0': case '1': case '2': case '3':
5742 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005743 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005744 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005745 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005746 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005747 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005749 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750 break;
5751
Benjamin Peterson29060642009-01-31 22:14:21 +00005752 /* hex escapes */
5753 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005755 digits = 2;
5756 message = "truncated \\xXX escape";
5757 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005761 digits = 4;
5762 message = "truncated \\uXXXX escape";
5763 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005766 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005767 digits = 8;
5768 message = "truncated \\UXXXXXXXX escape";
5769 hexescape:
5770 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005771 if (s+digits>end) {
5772 endinpos = size;
5773 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 errors, &errorHandler,
5775 "unicodeescape", "end of string in escape sequence",
5776 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005777 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005778 goto onError;
5779 goto nextByte;
5780 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005781 for (j = 0; j < digits; ++j) {
5782 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005783 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005784 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005785 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 errors, &errorHandler,
5787 "unicodeescape", message,
5788 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005789 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005790 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005791 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005792 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005793 }
5794 chr = (chr<<4) & ~0xF;
5795 if (c >= '0' && c <= '9')
5796 chr += c - '0';
5797 else if (c >= 'a' && c <= 'f')
5798 chr += 10 + c - 'a';
5799 else
5800 chr += 10 + c - 'A';
5801 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005802 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005803 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005804 /* _decoding_error will have already written into the
5805 target buffer. */
5806 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005807 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005808 /* when we get here, chr is a 32-bit unicode character */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005809 if (chr <= 0x10ffff) {
5810 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005811 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005812 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005813 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005814 errors, &errorHandler,
5815 "unicodeescape", "illegal Unicode character",
5816 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005817 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005818 goto onError;
5819 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005820 break;
5821
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005823 case 'N':
5824 message = "malformed \\N character escape";
5825 if (ucnhash_CAPI == NULL) {
5826 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005827 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5828 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005829 if (ucnhash_CAPI == NULL)
5830 goto ucnhashError;
5831 }
5832 if (*s == '{') {
5833 const char *start = s+1;
5834 /* look for the closing brace */
5835 while (*s != '}' && s < end)
5836 s++;
5837 if (s > start && s < end && *s == '}') {
5838 /* found a name. look it up in the unicode database */
5839 message = "unknown Unicode character name";
5840 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005841 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005842 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005843 goto store;
5844 }
5845 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005846 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005847 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 errors, &errorHandler,
5849 "unicodeescape", message,
5850 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005851 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005852 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005853 break;
5854
5855 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005856 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005857 message = "\\ at end of string";
5858 s--;
5859 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005860 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 errors, &errorHandler,
5862 "unicodeescape", message,
5863 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005864 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005865 goto onError;
5866 }
5867 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005868 WRITECHAR('\\');
5869 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005870 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005871 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005872 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005873 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005874 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005876#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005877
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005878 if (PyUnicode_Resize(&v, i) < 0)
5879 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005880 Py_XDECREF(errorHandler);
5881 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005882 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005883
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005885 PyErr_SetString(
5886 PyExc_UnicodeError,
5887 "\\N escapes not supported (can't load unicodedata module)"
5888 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005889 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005890 Py_XDECREF(errorHandler);
5891 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005892 return NULL;
5893
Benjamin Peterson29060642009-01-31 22:14:21 +00005894 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005896 Py_XDECREF(errorHandler);
5897 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 return NULL;
5899}
5900
5901/* Return a Unicode-Escape string version of the Unicode object.
5902
5903 If quotes is true, the string is enclosed in u"" or u'' quotes as
5904 appropriate.
5905
5906*/
5907
Alexander Belopolsky40018472011-02-26 01:02:56 +00005908PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005909PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005911 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005912 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005914 int kind;
5915 void *data;
5916 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917
Thomas Wouters89f507f2006-12-13 04:49:30 +00005918 /* Initial allocation is based on the longest-possible unichr
5919 escape.
5920
5921 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5922 unichr, so in this case it's the longest unichr escape. In
5923 narrow (UTF-16) builds this is five chars per source unichr
5924 since there are two unichrs in the surrogate pair, so in narrow
5925 (UTF-16) builds it's not the longest unichr escape.
5926
5927 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5928 so in the narrow (UTF-16) build case it's the longest unichr
5929 escape.
5930 */
5931
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005932 if (!PyUnicode_Check(unicode)) {
5933 PyErr_BadArgument();
5934 return NULL;
5935 }
5936 if (PyUnicode_READY(unicode) < 0)
5937 return NULL;
5938 len = PyUnicode_GET_LENGTH(unicode);
5939 kind = PyUnicode_KIND(unicode);
5940 data = PyUnicode_DATA(unicode);
5941 switch(kind) {
5942 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5943 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5944 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5945 }
5946
5947 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005948 return PyBytes_FromStringAndSize(NULL, 0);
5949
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005950 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005952
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005953 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005954 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005955 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005956 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 if (repr == NULL)
5958 return NULL;
5959
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005960 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005962 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005963 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005964
Walter Dörwald79e913e2007-05-12 11:08:06 +00005965 /* Escape backslashes */
5966 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967 *p++ = '\\';
5968 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005969 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005970 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005971
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005972 /* Map 21-bit characters to '\U00xxxxxx' */
5973 else if (ch >= 0x10000) {
5974 *p++ = '\\';
5975 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005976 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5977 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5978 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5979 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5980 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5981 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5982 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5983 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005985 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005986
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005988 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 *p++ = '\\';
5990 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005991 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5992 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5993 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5994 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005996
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005997 /* Map special whitespace to '\t', \n', '\r' */
5998 else if (ch == '\t') {
5999 *p++ = '\\';
6000 *p++ = 't';
6001 }
6002 else if (ch == '\n') {
6003 *p++ = '\\';
6004 *p++ = 'n';
6005 }
6006 else if (ch == '\r') {
6007 *p++ = '\\';
6008 *p++ = 'r';
6009 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006010
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006011 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006012 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006014 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006015 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6016 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006017 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006018
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 /* Copy everything else as-is */
6020 else
6021 *p++ = (char) ch;
6022 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006024 assert(p - PyBytes_AS_STRING(repr) > 0);
6025 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6026 return NULL;
6027 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028}
6029
Alexander Belopolsky40018472011-02-26 01:02:56 +00006030PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006031PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6032 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006034 PyObject *result;
6035 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6036 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006038 result = PyUnicode_AsUnicodeEscapeString(tmp);
6039 Py_DECREF(tmp);
6040 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041}
6042
6043/* --- Raw Unicode Escape Codec ------------------------------------------- */
6044
Alexander Belopolsky40018472011-02-26 01:02:56 +00006045PyObject *
6046PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006047 Py_ssize_t size,
6048 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006050 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006051 Py_ssize_t startinpos;
6052 Py_ssize_t endinpos;
6053 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006054 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 const char *end;
6056 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006057 PyObject *errorHandler = NULL;
6058 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006059
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 /* Escaped strings will always be longer than the resulting
6061 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006062 length after conversion to the true value. (But decoding error
6063 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006064 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006068 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006069 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 end = s + size;
6071 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006072 unsigned char c;
6073 Py_UCS4 x;
6074 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006075 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 /* Non-escape characters are interpreted as Unicode ordinals */
6078 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006079 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6080 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006081 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006082 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006083 startinpos = s-starts;
6084
6085 /* \u-escapes are only interpreted iff the number of leading
6086 backslashes if odd */
6087 bs = s;
6088 for (;s < end;) {
6089 if (*s != '\\')
6090 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006091 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6092 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 }
6094 if (((s - bs) & 1) == 0 ||
6095 s >= end ||
6096 (*s != 'u' && *s != 'U')) {
6097 continue;
6098 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006099 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 count = *s=='u' ? 4 : 8;
6101 s++;
6102
6103 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006104 for (x = 0, i = 0; i < count; ++i, ++s) {
6105 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006106 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006107 endinpos = s-starts;
6108 if (unicode_decode_call_errorhandler(
6109 errors, &errorHandler,
6110 "rawunicodeescape", "truncated \\uXXXX",
6111 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006112 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006113 goto onError;
6114 goto nextByte;
6115 }
6116 x = (x<<4) & ~0xF;
6117 if (c >= '0' && c <= '9')
6118 x += c - '0';
6119 else if (c >= 'a' && c <= 'f')
6120 x += 10 + c - 'a';
6121 else
6122 x += 10 + c - 'A';
6123 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006124 if (x <= 0x10ffff) {
6125 if (unicode_putchar(&v, &outpos, x) < 0)
6126 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006127 } else {
6128 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006129 if (unicode_decode_call_errorhandler(
6130 errors, &errorHandler,
6131 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006133 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006135 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 nextByte:
6137 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006139 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006141 Py_XDECREF(errorHandler);
6142 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006143 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006144
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006147 Py_XDECREF(errorHandler);
6148 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 return NULL;
6150}
6151
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152
Alexander Belopolsky40018472011-02-26 01:02:56 +00006153PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006154PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006156 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 char *p;
6158 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159 Py_ssize_t expandsize, pos;
6160 int kind;
6161 void *data;
6162 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006164 if (!PyUnicode_Check(unicode)) {
6165 PyErr_BadArgument();
6166 return NULL;
6167 }
6168 if (PyUnicode_READY(unicode) < 0)
6169 return NULL;
6170 kind = PyUnicode_KIND(unicode);
6171 data = PyUnicode_DATA(unicode);
6172 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006173
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006174 switch(kind) {
6175 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6176 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6177 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6178 }
Victor Stinner0e368262011-11-10 20:12:49 +01006179
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006181 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006182
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 if (repr == NULL)
6185 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006186 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006187 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006189 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006190 for (pos = 0; pos < len; pos++) {
6191 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006192 /* Map 32-bit characters to '\Uxxxxxxxx' */
6193 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006194 *p++ = '\\';
6195 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006196 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6197 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6198 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6199 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6200 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6201 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6202 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6203 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006204 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006206 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207 *p++ = '\\';
6208 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006209 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6210 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6211 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6212 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 /* Copy everything else as-is */
6215 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 *p++ = (char) ch;
6217 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006218
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006219 assert(p > q);
6220 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006221 return NULL;
6222 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223}
6224
Alexander Belopolsky40018472011-02-26 01:02:56 +00006225PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006226PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6227 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006229 PyObject *result;
6230 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6231 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006232 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006233 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6234 Py_DECREF(tmp);
6235 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236}
6237
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006238/* --- Unicode Internal Codec ------------------------------------------- */
6239
Alexander Belopolsky40018472011-02-26 01:02:56 +00006240PyObject *
6241_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006242 Py_ssize_t size,
6243 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006244{
6245 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006246 Py_ssize_t startinpos;
6247 Py_ssize_t endinpos;
6248 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006249 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006250 const char *end;
6251 const char *reason;
6252 PyObject *errorHandler = NULL;
6253 PyObject *exc = NULL;
6254
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006255 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006256 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006257 1))
6258 return NULL;
6259
Thomas Wouters89f507f2006-12-13 04:49:30 +00006260 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006261 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006262 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006264 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006265 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006266 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006267 end = s + size;
6268
6269 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006270 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006271 Py_UCS4 ch;
6272 /* We copy the raw representation one byte at a time because the
6273 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006274 ((char *) &uch)[0] = s[0];
6275 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006276#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006277 ((char *) &uch)[2] = s[2];
6278 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006279#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006280 ch = uch;
6281
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006282 /* We have to sanity check the raw data, otherwise doom looms for
6283 some malformed UCS-4 data. */
6284 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006285#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006286 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006287#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006288 end-s < Py_UNICODE_SIZE
6289 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006291 startinpos = s - starts;
6292 if (end-s < Py_UNICODE_SIZE) {
6293 endinpos = end-starts;
6294 reason = "truncated input";
6295 }
6296 else {
6297 endinpos = s - starts + Py_UNICODE_SIZE;
6298 reason = "illegal code point (> 0x10FFFF)";
6299 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006300 if (unicode_decode_call_errorhandler(
6301 errors, &errorHandler,
6302 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006303 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006304 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006305 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006306 continue;
6307 }
6308
6309 s += Py_UNICODE_SIZE;
6310#ifndef Py_UNICODE_WIDE
6311 if (ch >= 0xD800 && ch <= 0xDBFF && s < end)
6312 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006313 Py_UNICODE uch2;
6314 ((char *) &uch2)[0] = s[0];
6315 ((char *) &uch2)[1] = s[1];
6316 if (uch2 >= 0xDC00 && uch2 <= 0xDFFF)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006317 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006318 ch = (((uch & 0x3FF)<<10) | (uch2 & 0x3FF)) + 0x10000;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006319 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006320 }
6321 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006322#endif
6323
6324 if (unicode_putchar(&v, &outpos, ch) < 0)
6325 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006326 }
6327
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006328 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006329 goto onError;
6330 Py_XDECREF(errorHandler);
6331 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006332 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006333
Benjamin Peterson29060642009-01-31 22:14:21 +00006334 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006335 Py_XDECREF(v);
6336 Py_XDECREF(errorHandler);
6337 Py_XDECREF(exc);
6338 return NULL;
6339}
6340
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341/* --- Latin-1 Codec ------------------------------------------------------ */
6342
Alexander Belopolsky40018472011-02-26 01:02:56 +00006343PyObject *
6344PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006345 Py_ssize_t size,
6346 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006349 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350}
6351
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006352/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006353static void
6354make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006355 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006356 PyObject *unicode,
6357 Py_ssize_t startpos, Py_ssize_t endpos,
6358 const char *reason)
6359{
6360 if (*exceptionObject == NULL) {
6361 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006362 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006363 encoding, unicode, startpos, endpos, reason);
6364 }
6365 else {
6366 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6367 goto onError;
6368 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6369 goto onError;
6370 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6371 goto onError;
6372 return;
6373 onError:
6374 Py_DECREF(*exceptionObject);
6375 *exceptionObject = NULL;
6376 }
6377}
6378
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006379/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006380static void
6381raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006382 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006383 PyObject *unicode,
6384 Py_ssize_t startpos, Py_ssize_t endpos,
6385 const char *reason)
6386{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006387 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006388 encoding, unicode, startpos, endpos, reason);
6389 if (*exceptionObject != NULL)
6390 PyCodec_StrictErrors(*exceptionObject);
6391}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006392
6393/* error handling callback helper:
6394 build arguments, call the callback and check the arguments,
6395 put the result into newpos and return the replacement string, which
6396 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006397static PyObject *
6398unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006399 PyObject **errorHandler,
6400 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006401 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006402 Py_ssize_t startpos, Py_ssize_t endpos,
6403 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006405 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006406 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006407 PyObject *restuple;
6408 PyObject *resunicode;
6409
6410 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006412 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414 }
6415
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006416 if (PyUnicode_READY(unicode) < 0)
6417 return NULL;
6418 len = PyUnicode_GET_LENGTH(unicode);
6419
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006420 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006421 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006422 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006424
6425 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006426 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006427 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006428 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006429 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006430 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006431 Py_DECREF(restuple);
6432 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006433 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006434 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006435 &resunicode, newpos)) {
6436 Py_DECREF(restuple);
6437 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006438 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006439 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6440 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6441 Py_DECREF(restuple);
6442 return NULL;
6443 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006444 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006445 *newpos = len + *newpos;
6446 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6448 Py_DECREF(restuple);
6449 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006450 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451 Py_INCREF(resunicode);
6452 Py_DECREF(restuple);
6453 return resunicode;
6454}
6455
Alexander Belopolsky40018472011-02-26 01:02:56 +00006456static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006457unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006458 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006459 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006460{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006461 /* input state */
6462 Py_ssize_t pos=0, size;
6463 int kind;
6464 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006465 /* output object */
6466 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006467 /* pointer into the output */
6468 char *str;
6469 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006470 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006471 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6472 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006473 PyObject *errorHandler = NULL;
6474 PyObject *exc = NULL;
6475 /* the following variable is used for caching string comparisons
6476 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6477 int known_errorHandler = -1;
6478
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006479 if (PyUnicode_READY(unicode) < 0)
6480 return NULL;
6481 size = PyUnicode_GET_LENGTH(unicode);
6482 kind = PyUnicode_KIND(unicode);
6483 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006484 /* allocate enough for a simple encoding without
6485 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006486 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006487 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006488 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006489 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006490 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006491 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006492 ressize = size;
6493
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006494 while (pos < size) {
6495 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006496
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 /* can we encode this? */
6498 if (c<limit) {
6499 /* no overflow check, because we know that the space is enough */
6500 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006501 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006502 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 Py_ssize_t requiredsize;
6505 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006506 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006508 Py_ssize_t collstart = pos;
6509 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006510 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006511 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 ++collend;
6513 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6514 if (known_errorHandler==-1) {
6515 if ((errors==NULL) || (!strcmp(errors, "strict")))
6516 known_errorHandler = 1;
6517 else if (!strcmp(errors, "replace"))
6518 known_errorHandler = 2;
6519 else if (!strcmp(errors, "ignore"))
6520 known_errorHandler = 3;
6521 else if (!strcmp(errors, "xmlcharrefreplace"))
6522 known_errorHandler = 4;
6523 else
6524 known_errorHandler = 0;
6525 }
6526 switch (known_errorHandler) {
6527 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006528 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 goto onError;
6530 case 2: /* replace */
6531 while (collstart++<collend)
6532 *str++ = '?'; /* fall through */
6533 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006534 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 break;
6536 case 4: /* xmlcharrefreplace */
6537 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006538 /* determine replacement size */
6539 for (i = collstart, repsize = 0; i < collend; ++i) {
6540 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6541 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006543 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006545 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006546 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006547 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006549#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 else
6551 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006552#else
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006553 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006554 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006555 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 repsize += 2+6+1;
6557 else
6558 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006559#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006561 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 if (requiredsize > ressize) {
6563 if (requiredsize<2*ressize)
6564 requiredsize = 2*ressize;
6565 if (_PyBytes_Resize(&res, requiredsize))
6566 goto onError;
6567 str = PyBytes_AS_STRING(res) + respos;
6568 ressize = requiredsize;
6569 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006570 /* generate replacement */
6571 for (i = collstart; i < collend; ++i) {
6572 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006573 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006574 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006575 break;
6576 default:
6577 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006578 encoding, reason, unicode, &exc,
6579 collstart, collend, &newpos);
6580 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6581 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006582 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006583 if (PyBytes_Check(repunicode)) {
6584 /* Directly copy bytes result to output. */
6585 repsize = PyBytes_Size(repunicode);
6586 if (repsize > 1) {
6587 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006588 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006589 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6590 Py_DECREF(repunicode);
6591 goto onError;
6592 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006593 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006594 ressize += repsize-1;
6595 }
6596 memcpy(str, PyBytes_AsString(repunicode), repsize);
6597 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006598 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006599 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006600 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006601 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006602 /* need more space? (at least enough for what we
6603 have+the replacement+the rest of the string, so
6604 we won't have to check space for encodable characters) */
6605 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006606 repsize = PyUnicode_GET_LENGTH(repunicode);
6607 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 if (requiredsize > ressize) {
6609 if (requiredsize<2*ressize)
6610 requiredsize = 2*ressize;
6611 if (_PyBytes_Resize(&res, requiredsize)) {
6612 Py_DECREF(repunicode);
6613 goto onError;
6614 }
6615 str = PyBytes_AS_STRING(res) + respos;
6616 ressize = requiredsize;
6617 }
6618 /* check if there is anything unencodable in the replacement
6619 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 for (i = 0; repsize-->0; ++i, ++str) {
6621 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006623 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006624 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 Py_DECREF(repunicode);
6626 goto onError;
6627 }
6628 *str = (char)c;
6629 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006630 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006631 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006632 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006633 }
6634 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006635 /* Resize if we allocated to much */
6636 size = str - PyBytes_AS_STRING(res);
6637 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006638 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006639 if (_PyBytes_Resize(&res, size) < 0)
6640 goto onError;
6641 }
6642
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006643 Py_XDECREF(errorHandler);
6644 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006645 return res;
6646
6647 onError:
6648 Py_XDECREF(res);
6649 Py_XDECREF(errorHandler);
6650 Py_XDECREF(exc);
6651 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006652}
6653
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006654/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006655PyObject *
6656PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006657 Py_ssize_t size,
6658 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006660 PyObject *result;
6661 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6662 if (unicode == NULL)
6663 return NULL;
6664 result = unicode_encode_ucs1(unicode, errors, 256);
6665 Py_DECREF(unicode);
6666 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667}
6668
Alexander Belopolsky40018472011-02-26 01:02:56 +00006669PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006670_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671{
6672 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 PyErr_BadArgument();
6674 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006676 if (PyUnicode_READY(unicode) == -1)
6677 return NULL;
6678 /* Fast path: if it is a one-byte string, construct
6679 bytes object directly. */
6680 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6681 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6682 PyUnicode_GET_LENGTH(unicode));
6683 /* Non-Latin-1 characters present. Defer to above function to
6684 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006685 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006686}
6687
6688PyObject*
6689PyUnicode_AsLatin1String(PyObject *unicode)
6690{
6691 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006692}
6693
6694/* --- 7-bit ASCII Codec -------------------------------------------------- */
6695
Alexander Belopolsky40018472011-02-26 01:02:56 +00006696PyObject *
6697PyUnicode_DecodeASCII(const char *s,
6698 Py_ssize_t size,
6699 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006700{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006701 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006702 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006703 int kind;
6704 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006705 Py_ssize_t startinpos;
6706 Py_ssize_t endinpos;
6707 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006708 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006709 int has_error;
6710 const unsigned char *p = (const unsigned char *)s;
6711 const unsigned char *end = p + size;
6712 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713 PyObject *errorHandler = NULL;
6714 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006715
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006716 if (size == 0) {
6717 Py_INCREF(unicode_empty);
6718 return unicode_empty;
6719 }
6720
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006722 if (size == 1 && (unsigned char)s[0] < 128)
6723 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006724
Victor Stinner702c7342011-10-05 13:50:52 +02006725 has_error = 0;
6726 while (p < end && !has_error) {
6727 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6728 an explanation. */
6729 if (!((size_t) p & LONG_PTR_MASK)) {
6730 /* Help register allocation */
6731 register const unsigned char *_p = p;
6732 while (_p < aligned_end) {
6733 unsigned long value = *(unsigned long *) _p;
6734 if (value & ASCII_CHAR_MASK) {
6735 has_error = 1;
6736 break;
6737 }
6738 _p += SIZEOF_LONG;
6739 }
6740 if (_p == end)
6741 break;
6742 if (has_error)
6743 break;
6744 p = _p;
6745 }
6746 if (*p & 0x80) {
6747 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006748 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006749 }
6750 else {
6751 ++p;
6752 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006753 }
Victor Stinner702c7342011-10-05 13:50:52 +02006754 if (!has_error)
6755 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006756
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006757 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006759 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006761 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006762 kind = PyUnicode_KIND(v);
6763 data = PyUnicode_DATA(v);
6764 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006765 e = s + size;
6766 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 register unsigned char c = (unsigned char)*s;
6768 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006769 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 ++s;
6771 }
6772 else {
6773 startinpos = s-starts;
6774 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 if (unicode_decode_call_errorhandler(
6776 errors, &errorHandler,
6777 "ascii", "ordinal not in range(128)",
6778 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006779 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006780 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006781 kind = PyUnicode_KIND(v);
6782 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006785 if (PyUnicode_Resize(&v, outpos) < 0)
6786 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006787 Py_XDECREF(errorHandler);
6788 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006789 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006790 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006791
Benjamin Peterson29060642009-01-31 22:14:21 +00006792 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006793 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006794 Py_XDECREF(errorHandler);
6795 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006796 return NULL;
6797}
6798
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006799/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006800PyObject *
6801PyUnicode_EncodeASCII(const Py_UNICODE *p,
6802 Py_ssize_t size,
6803 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006804{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006805 PyObject *result;
6806 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6807 if (unicode == NULL)
6808 return NULL;
6809 result = unicode_encode_ucs1(unicode, errors, 128);
6810 Py_DECREF(unicode);
6811 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812}
6813
Alexander Belopolsky40018472011-02-26 01:02:56 +00006814PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006815_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816{
6817 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006818 PyErr_BadArgument();
6819 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006821 if (PyUnicode_READY(unicode) == -1)
6822 return NULL;
6823 /* Fast path: if it is an ASCII-only string, construct bytes object
6824 directly. Else defer to above function to raise the exception. */
6825 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6826 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6827 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006828 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006829}
6830
6831PyObject *
6832PyUnicode_AsASCIIString(PyObject *unicode)
6833{
6834 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835}
6836
Victor Stinner99b95382011-07-04 14:23:54 +02006837#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006838
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006839/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006840
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006841#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006842#define NEED_RETRY
6843#endif
6844
Victor Stinner3a50e702011-10-18 21:21:00 +02006845#ifndef WC_ERR_INVALID_CHARS
6846# define WC_ERR_INVALID_CHARS 0x0080
6847#endif
6848
6849static char*
6850code_page_name(UINT code_page, PyObject **obj)
6851{
6852 *obj = NULL;
6853 if (code_page == CP_ACP)
6854 return "mbcs";
6855 if (code_page == CP_UTF7)
6856 return "CP_UTF7";
6857 if (code_page == CP_UTF8)
6858 return "CP_UTF8";
6859
6860 *obj = PyBytes_FromFormat("cp%u", code_page);
6861 if (*obj == NULL)
6862 return NULL;
6863 return PyBytes_AS_STRING(*obj);
6864}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006865
Alexander Belopolsky40018472011-02-26 01:02:56 +00006866static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006867is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006868{
6869 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006870 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006871
Victor Stinner3a50e702011-10-18 21:21:00 +02006872 if (!IsDBCSLeadByteEx(code_page, *curr))
6873 return 0;
6874
6875 prev = CharPrevExA(code_page, s, curr, 0);
6876 if (prev == curr)
6877 return 1;
6878 /* FIXME: This code is limited to "true" double-byte encodings,
6879 as it assumes an incomplete character consists of a single
6880 byte. */
6881 if (curr - prev == 2)
6882 return 1;
6883 if (!IsDBCSLeadByteEx(code_page, *prev))
6884 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006885 return 0;
6886}
6887
Victor Stinner3a50e702011-10-18 21:21:00 +02006888static DWORD
6889decode_code_page_flags(UINT code_page)
6890{
6891 if (code_page == CP_UTF7) {
6892 /* The CP_UTF7 decoder only supports flags=0 */
6893 return 0;
6894 }
6895 else
6896 return MB_ERR_INVALID_CHARS;
6897}
6898
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006899/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006900 * Decode a byte string from a Windows code page into unicode object in strict
6901 * mode.
6902 *
6903 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6904 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006905 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006906static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006907decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006908 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006909 const char *in,
6910 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006911{
Victor Stinner3a50e702011-10-18 21:21:00 +02006912 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006913 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006914 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006915
6916 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006917 assert(insize > 0);
6918 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6919 if (outsize <= 0)
6920 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006921
6922 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006923 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006924 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006925 if (*v == NULL)
6926 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006927 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006928 }
6929 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006930 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006931 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006932 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006934 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006935 }
6936
6937 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006938 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6939 if (outsize <= 0)
6940 goto error;
6941 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006942
Victor Stinner3a50e702011-10-18 21:21:00 +02006943error:
6944 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6945 return -2;
6946 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006947 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006948}
6949
Victor Stinner3a50e702011-10-18 21:21:00 +02006950/*
6951 * Decode a byte string from a code page into unicode object with an error
6952 * handler.
6953 *
6954 * Returns consumed size if succeed, or raise a WindowsError or
6955 * UnicodeDecodeError exception and returns -1 on error.
6956 */
6957static int
6958decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006959 PyObject **v,
6960 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006961 const char *errors)
6962{
6963 const char *startin = in;
6964 const char *endin = in + size;
6965 const DWORD flags = decode_code_page_flags(code_page);
6966 /* Ideally, we should get reason from FormatMessage. This is the Windows
6967 2000 English version of the message. */
6968 const char *reason = "No mapping for the Unicode character exists "
6969 "in the target code page.";
6970 /* each step cannot decode more than 1 character, but a character can be
6971 represented as a surrogate pair */
6972 wchar_t buffer[2], *startout, *out;
6973 int insize, outsize;
6974 PyObject *errorHandler = NULL;
6975 PyObject *exc = NULL;
6976 PyObject *encoding_obj = NULL;
6977 char *encoding;
6978 DWORD err;
6979 int ret = -1;
6980
6981 assert(size > 0);
6982
6983 encoding = code_page_name(code_page, &encoding_obj);
6984 if (encoding == NULL)
6985 return -1;
6986
6987 if (errors == NULL || strcmp(errors, "strict") == 0) {
6988 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6989 UnicodeDecodeError. */
6990 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6991 if (exc != NULL) {
6992 PyCodec_StrictErrors(exc);
6993 Py_CLEAR(exc);
6994 }
6995 goto error;
6996 }
6997
6998 if (*v == NULL) {
6999 /* Create unicode object */
7000 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7001 PyErr_NoMemory();
7002 goto error;
7003 }
Victor Stinner76a31a62011-11-04 00:05:13 +01007004 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007005 if (*v == NULL)
7006 goto error;
7007 startout = PyUnicode_AS_UNICODE(*v);
7008 }
7009 else {
7010 /* Extend unicode object */
7011 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7012 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7013 PyErr_NoMemory();
7014 goto error;
7015 }
Victor Stinner76a31a62011-11-04 00:05:13 +01007016 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007017 goto error;
7018 startout = PyUnicode_AS_UNICODE(*v) + n;
7019 }
7020
7021 /* Decode the byte string character per character */
7022 out = startout;
7023 while (in < endin)
7024 {
7025 /* Decode a character */
7026 insize = 1;
7027 do
7028 {
7029 outsize = MultiByteToWideChar(code_page, flags,
7030 in, insize,
7031 buffer, Py_ARRAY_LENGTH(buffer));
7032 if (outsize > 0)
7033 break;
7034 err = GetLastError();
7035 if (err != ERROR_NO_UNICODE_TRANSLATION
7036 && err != ERROR_INSUFFICIENT_BUFFER)
7037 {
7038 PyErr_SetFromWindowsErr(0);
7039 goto error;
7040 }
7041 insize++;
7042 }
7043 /* 4=maximum length of a UTF-8 sequence */
7044 while (insize <= 4 && (in + insize) <= endin);
7045
7046 if (outsize <= 0) {
7047 Py_ssize_t startinpos, endinpos, outpos;
7048
7049 startinpos = in - startin;
7050 endinpos = startinpos + 1;
7051 outpos = out - PyUnicode_AS_UNICODE(*v);
7052 if (unicode_decode_call_errorhandler(
7053 errors, &errorHandler,
7054 encoding, reason,
7055 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007056 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007057 {
7058 goto error;
7059 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007060 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007061 }
7062 else {
7063 in += insize;
7064 memcpy(out, buffer, outsize * sizeof(wchar_t));
7065 out += outsize;
7066 }
7067 }
7068
7069 /* write a NUL character at the end */
7070 *out = 0;
7071
7072 /* Extend unicode object */
7073 outsize = out - startout;
7074 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007075 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007076 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007077 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007078
7079error:
7080 Py_XDECREF(encoding_obj);
7081 Py_XDECREF(errorHandler);
7082 Py_XDECREF(exc);
7083 return ret;
7084}
7085
Victor Stinner3a50e702011-10-18 21:21:00 +02007086static PyObject *
7087decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007088 const char *s, Py_ssize_t size,
7089 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007090{
Victor Stinner76a31a62011-11-04 00:05:13 +01007091 PyObject *v = NULL;
7092 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007093
Victor Stinner3a50e702011-10-18 21:21:00 +02007094 if (code_page < 0) {
7095 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7096 return NULL;
7097 }
7098
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007099 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007101
Victor Stinner76a31a62011-11-04 00:05:13 +01007102 do
7103 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007104#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007105 if (size > INT_MAX) {
7106 chunk_size = INT_MAX;
7107 final = 0;
7108 done = 0;
7109 }
7110 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007111#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007112 {
7113 chunk_size = (int)size;
7114 final = (consumed == NULL);
7115 done = 1;
7116 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117
Victor Stinner76a31a62011-11-04 00:05:13 +01007118 /* Skip trailing lead-byte unless 'final' is set */
7119 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7120 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007121
Victor Stinner76a31a62011-11-04 00:05:13 +01007122 if (chunk_size == 0 && done) {
7123 if (v != NULL)
7124 break;
7125 Py_INCREF(unicode_empty);
7126 return unicode_empty;
7127 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007128
Victor Stinner76a31a62011-11-04 00:05:13 +01007129
7130 converted = decode_code_page_strict(code_page, &v,
7131 s, chunk_size);
7132 if (converted == -2)
7133 converted = decode_code_page_errors(code_page, &v,
7134 s, chunk_size,
7135 errors);
7136 assert(converted != 0);
7137
7138 if (converted < 0) {
7139 Py_XDECREF(v);
7140 return NULL;
7141 }
7142
7143 if (consumed)
7144 *consumed += converted;
7145
7146 s += converted;
7147 size -= converted;
7148 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007149
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007150 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007151}
7152
Alexander Belopolsky40018472011-02-26 01:02:56 +00007153PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007154PyUnicode_DecodeCodePageStateful(int code_page,
7155 const char *s,
7156 Py_ssize_t size,
7157 const char *errors,
7158 Py_ssize_t *consumed)
7159{
7160 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7161}
7162
7163PyObject *
7164PyUnicode_DecodeMBCSStateful(const char *s,
7165 Py_ssize_t size,
7166 const char *errors,
7167 Py_ssize_t *consumed)
7168{
7169 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7170}
7171
7172PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007173PyUnicode_DecodeMBCS(const char *s,
7174 Py_ssize_t size,
7175 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007176{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007177 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7178}
7179
Victor Stinner3a50e702011-10-18 21:21:00 +02007180static DWORD
7181encode_code_page_flags(UINT code_page, const char *errors)
7182{
7183 if (code_page == CP_UTF8) {
7184 if (winver.dwMajorVersion >= 6)
7185 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7186 and later */
7187 return WC_ERR_INVALID_CHARS;
7188 else
7189 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7190 return 0;
7191 }
7192 else if (code_page == CP_UTF7) {
7193 /* CP_UTF7 only supports flags=0 */
7194 return 0;
7195 }
7196 else {
7197 if (errors != NULL && strcmp(errors, "replace") == 0)
7198 return 0;
7199 else
7200 return WC_NO_BEST_FIT_CHARS;
7201 }
7202}
7203
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007204/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007205 * Encode a Unicode string to a Windows code page into a byte string in strict
7206 * mode.
7207 *
7208 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7209 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007210 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007211static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007212encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007213 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007214 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007215{
Victor Stinner554f3f02010-06-16 23:33:54 +00007216 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007217 BOOL *pusedDefaultChar = &usedDefaultChar;
7218 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007219 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007220 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007221 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 const DWORD flags = encode_code_page_flags(code_page, NULL);
7223 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007224 /* Create a substring so that we can get the UTF-16 representation
7225 of just the slice under consideration. */
7226 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007227
Martin v. Löwis3d325192011-11-04 18:23:06 +01007228 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007229
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007231 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007233 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007234
Victor Stinner2fc507f2011-11-04 20:06:39 +01007235 substring = PyUnicode_Substring(unicode, offset, offset+len);
7236 if (substring == NULL)
7237 return -1;
7238 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7239 if (p == NULL) {
7240 Py_DECREF(substring);
7241 return -1;
7242 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007243
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007244 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007245 outsize = WideCharToMultiByte(code_page, flags,
7246 p, size,
7247 NULL, 0,
7248 NULL, pusedDefaultChar);
7249 if (outsize <= 0)
7250 goto error;
7251 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007252 if (pusedDefaultChar && *pusedDefaultChar) {
7253 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007254 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007255 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007256
Victor Stinner3a50e702011-10-18 21:21:00 +02007257 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007258 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007259 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007260 if (*outbytes == NULL) {
7261 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007262 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007263 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007264 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007265 }
7266 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007267 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007268 const Py_ssize_t n = PyBytes_Size(*outbytes);
7269 if (outsize > PY_SSIZE_T_MAX - n) {
7270 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007271 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007272 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007273 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007274 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7275 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007277 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007278 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007279 }
7280
7281 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007282 outsize = WideCharToMultiByte(code_page, flags,
7283 p, size,
7284 out, outsize,
7285 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007286 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007287 if (outsize <= 0)
7288 goto error;
7289 if (pusedDefaultChar && *pusedDefaultChar)
7290 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007291 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007292
Victor Stinner3a50e702011-10-18 21:21:00 +02007293error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007294 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007295 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7296 return -2;
7297 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007298 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007299}
7300
Victor Stinner3a50e702011-10-18 21:21:00 +02007301/*
7302 * Encode a Unicode string to a Windows code page into a byte string using a
7303 * error handler.
7304 *
7305 * Returns consumed characters if succeed, or raise a WindowsError and returns
7306 * -1 on other error.
7307 */
7308static int
7309encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007310 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007311 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007312{
Victor Stinner3a50e702011-10-18 21:21:00 +02007313 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007314 Py_ssize_t pos = unicode_offset;
7315 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007316 /* Ideally, we should get reason from FormatMessage. This is the Windows
7317 2000 English version of the message. */
7318 const char *reason = "invalid character";
7319 /* 4=maximum length of a UTF-8 sequence */
7320 char buffer[4];
7321 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7322 Py_ssize_t outsize;
7323 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007324 PyObject *errorHandler = NULL;
7325 PyObject *exc = NULL;
7326 PyObject *encoding_obj = NULL;
7327 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007328 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007329 PyObject *rep;
7330 int ret = -1;
7331
7332 assert(insize > 0);
7333
7334 encoding = code_page_name(code_page, &encoding_obj);
7335 if (encoding == NULL)
7336 return -1;
7337
7338 if (errors == NULL || strcmp(errors, "strict") == 0) {
7339 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7340 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007341 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007342 if (exc != NULL) {
7343 PyCodec_StrictErrors(exc);
7344 Py_DECREF(exc);
7345 }
7346 Py_XDECREF(encoding_obj);
7347 return -1;
7348 }
7349
7350 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7351 pusedDefaultChar = &usedDefaultChar;
7352 else
7353 pusedDefaultChar = NULL;
7354
7355 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7356 PyErr_NoMemory();
7357 goto error;
7358 }
7359 outsize = insize * Py_ARRAY_LENGTH(buffer);
7360
7361 if (*outbytes == NULL) {
7362 /* Create string object */
7363 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7364 if (*outbytes == NULL)
7365 goto error;
7366 out = PyBytes_AS_STRING(*outbytes);
7367 }
7368 else {
7369 /* Extend string object */
7370 Py_ssize_t n = PyBytes_Size(*outbytes);
7371 if (n > PY_SSIZE_T_MAX - outsize) {
7372 PyErr_NoMemory();
7373 goto error;
7374 }
7375 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7376 goto error;
7377 out = PyBytes_AS_STRING(*outbytes) + n;
7378 }
7379
7380 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007381 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007382 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007383 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7384 wchar_t chars[2];
7385 int charsize;
7386 if (ch < 0x10000) {
7387 chars[0] = (wchar_t)ch;
7388 charsize = 1;
7389 }
7390 else {
7391 ch -= 0x10000;
7392 chars[0] = 0xd800 + (ch >> 10);
7393 chars[1] = 0xdc00 + (ch & 0x3ff);
7394 charsize = 2;
7395 }
7396
Victor Stinner3a50e702011-10-18 21:21:00 +02007397 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007398 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007399 buffer, Py_ARRAY_LENGTH(buffer),
7400 NULL, pusedDefaultChar);
7401 if (outsize > 0) {
7402 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7403 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007404 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007405 memcpy(out, buffer, outsize);
7406 out += outsize;
7407 continue;
7408 }
7409 }
7410 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7411 PyErr_SetFromWindowsErr(0);
7412 goto error;
7413 }
7414
Victor Stinner3a50e702011-10-18 21:21:00 +02007415 rep = unicode_encode_call_errorhandler(
7416 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007417 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007418 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007419 if (rep == NULL)
7420 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007421 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007422
7423 if (PyBytes_Check(rep)) {
7424 outsize = PyBytes_GET_SIZE(rep);
7425 if (outsize != 1) {
7426 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7427 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7428 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7429 Py_DECREF(rep);
7430 goto error;
7431 }
7432 out = PyBytes_AS_STRING(*outbytes) + offset;
7433 }
7434 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7435 out += outsize;
7436 }
7437 else {
7438 Py_ssize_t i;
7439 enum PyUnicode_Kind kind;
7440 void *data;
7441
7442 if (PyUnicode_READY(rep) < 0) {
7443 Py_DECREF(rep);
7444 goto error;
7445 }
7446
7447 outsize = PyUnicode_GET_LENGTH(rep);
7448 if (outsize != 1) {
7449 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7450 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7451 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7452 Py_DECREF(rep);
7453 goto error;
7454 }
7455 out = PyBytes_AS_STRING(*outbytes) + offset;
7456 }
7457 kind = PyUnicode_KIND(rep);
7458 data = PyUnicode_DATA(rep);
7459 for (i=0; i < outsize; i++) {
7460 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7461 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007462 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007463 encoding, unicode,
7464 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007465 "unable to encode error handler result to ASCII");
7466 Py_DECREF(rep);
7467 goto error;
7468 }
7469 *out = (unsigned char)ch;
7470 out++;
7471 }
7472 }
7473 Py_DECREF(rep);
7474 }
7475 /* write a NUL byte */
7476 *out = 0;
7477 outsize = out - PyBytes_AS_STRING(*outbytes);
7478 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7479 if (_PyBytes_Resize(outbytes, outsize) < 0)
7480 goto error;
7481 ret = 0;
7482
7483error:
7484 Py_XDECREF(encoding_obj);
7485 Py_XDECREF(errorHandler);
7486 Py_XDECREF(exc);
7487 return ret;
7488}
7489
Victor Stinner3a50e702011-10-18 21:21:00 +02007490static PyObject *
7491encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007492 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007493 const char *errors)
7494{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007495 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007496 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007497 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007498 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007499
Victor Stinner2fc507f2011-11-04 20:06:39 +01007500 if (PyUnicode_READY(unicode) < 0)
7501 return NULL;
7502 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007503
Victor Stinner3a50e702011-10-18 21:21:00 +02007504 if (code_page < 0) {
7505 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7506 return NULL;
7507 }
7508
Martin v. Löwis3d325192011-11-04 18:23:06 +01007509 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007510 return PyBytes_FromStringAndSize(NULL, 0);
7511
Victor Stinner7581cef2011-11-03 22:32:33 +01007512 offset = 0;
7513 do
7514 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007515#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007516 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007517 chunks. */
7518 if (len > INT_MAX/2) {
7519 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007520 done = 0;
7521 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007522 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007523#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007524 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007525 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007526 done = 1;
7527 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007528
Victor Stinner76a31a62011-11-04 00:05:13 +01007529 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007530 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007531 errors);
7532 if (ret == -2)
7533 ret = encode_code_page_errors(code_page, &outbytes,
7534 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007535 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007536 if (ret < 0) {
7537 Py_XDECREF(outbytes);
7538 return NULL;
7539 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007540
Victor Stinner7581cef2011-11-03 22:32:33 +01007541 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007542 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007543 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007544
Victor Stinner3a50e702011-10-18 21:21:00 +02007545 return outbytes;
7546}
7547
7548PyObject *
7549PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7550 Py_ssize_t size,
7551 const char *errors)
7552{
Victor Stinner7581cef2011-11-03 22:32:33 +01007553 PyObject *unicode, *res;
7554 unicode = PyUnicode_FromUnicode(p, size);
7555 if (unicode == NULL)
7556 return NULL;
7557 res = encode_code_page(CP_ACP, unicode, errors);
7558 Py_DECREF(unicode);
7559 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007560}
7561
7562PyObject *
7563PyUnicode_EncodeCodePage(int code_page,
7564 PyObject *unicode,
7565 const char *errors)
7566{
Victor Stinner7581cef2011-11-03 22:32:33 +01007567 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007568}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007569
Alexander Belopolsky40018472011-02-26 01:02:56 +00007570PyObject *
7571PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007572{
7573 if (!PyUnicode_Check(unicode)) {
7574 PyErr_BadArgument();
7575 return NULL;
7576 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007577 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007578}
7579
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007580#undef NEED_RETRY
7581
Victor Stinner99b95382011-07-04 14:23:54 +02007582#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007583
Guido van Rossumd57fd912000-03-10 22:53:23 +00007584/* --- Character Mapping Codec -------------------------------------------- */
7585
Alexander Belopolsky40018472011-02-26 01:02:56 +00007586PyObject *
7587PyUnicode_DecodeCharmap(const char *s,
7588 Py_ssize_t size,
7589 PyObject *mapping,
7590 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007592 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007593 Py_ssize_t startinpos;
7594 Py_ssize_t endinpos;
7595 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007596 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007597 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007598 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007599 PyObject *errorHandler = NULL;
7600 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007601
Guido van Rossumd57fd912000-03-10 22:53:23 +00007602 /* Default to Latin-1 */
7603 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007604 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007605
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007606 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007609 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007610 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007611 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007612 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007613 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007614 Py_ssize_t maplen;
7615 enum PyUnicode_Kind kind;
7616 void *data;
7617 Py_UCS4 x;
7618
7619 if (PyUnicode_READY(mapping) < 0)
7620 return NULL;
7621
7622 maplen = PyUnicode_GET_LENGTH(mapping);
7623 data = PyUnicode_DATA(mapping);
7624 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007625 while (s < e) {
7626 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007627
Benjamin Peterson29060642009-01-31 22:14:21 +00007628 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007629 x = PyUnicode_READ(kind, data, ch);
7630 else
7631 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007632
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007633 if (x == 0xfffe)
7634 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007635 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007636 startinpos = s-starts;
7637 endinpos = startinpos+1;
7638 if (unicode_decode_call_errorhandler(
7639 errors, &errorHandler,
7640 "charmap", "character maps to <undefined>",
7641 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007642 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007643 goto onError;
7644 }
7645 continue;
7646 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007647
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007648 if (unicode_putchar(&v, &outpos, x) < 0)
7649 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007650 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007651 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007652 }
7653 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 while (s < e) {
7655 unsigned char ch = *s;
7656 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007657
Benjamin Peterson29060642009-01-31 22:14:21 +00007658 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7659 w = PyLong_FromLong((long)ch);
7660 if (w == NULL)
7661 goto onError;
7662 x = PyObject_GetItem(mapping, w);
7663 Py_DECREF(w);
7664 if (x == NULL) {
7665 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7666 /* No mapping found means: mapping is undefined. */
7667 PyErr_Clear();
7668 x = Py_None;
7669 Py_INCREF(x);
7670 } else
7671 goto onError;
7672 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007673
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 /* Apply mapping */
7675 if (PyLong_Check(x)) {
7676 long value = PyLong_AS_LONG(x);
7677 if (value < 0 || value > 65535) {
7678 PyErr_SetString(PyExc_TypeError,
7679 "character mapping must be in range(65536)");
7680 Py_DECREF(x);
7681 goto onError;
7682 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007683 if (unicode_putchar(&v, &outpos, value) < 0)
7684 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 }
7686 else if (x == Py_None) {
7687 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 startinpos = s-starts;
7689 endinpos = startinpos+1;
7690 if (unicode_decode_call_errorhandler(
7691 errors, &errorHandler,
7692 "charmap", "character maps to <undefined>",
7693 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007694 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007695 Py_DECREF(x);
7696 goto onError;
7697 }
7698 Py_DECREF(x);
7699 continue;
7700 }
7701 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007702 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007703
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007704 if (PyUnicode_READY(x) < 0)
7705 goto onError;
7706 targetsize = PyUnicode_GET_LENGTH(x);
7707
7708 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007709 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007710 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007711 PyUnicode_READ_CHAR(x, 0)) < 0)
7712 goto onError;
7713 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 else if (targetsize > 1) {
7715 /* 1-n mapping */
7716 if (targetsize > extrachars) {
7717 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 Py_ssize_t needed = (targetsize - extrachars) + \
7719 (targetsize << 2);
7720 extrachars += needed;
7721 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007722 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007723 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007724 Py_DECREF(x);
7725 goto onError;
7726 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007727 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007728 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7729 goto onError;
7730 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7731 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007732 extrachars -= targetsize;
7733 }
7734 /* 1-0 mapping: skip the character */
7735 }
7736 else {
7737 /* wrong return value */
7738 PyErr_SetString(PyExc_TypeError,
7739 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007740 Py_DECREF(x);
7741 goto onError;
7742 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007743 Py_DECREF(x);
7744 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007745 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007747 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007748 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007749 Py_XDECREF(errorHandler);
7750 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007751 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007752
Benjamin Peterson29060642009-01-31 22:14:21 +00007753 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007754 Py_XDECREF(errorHandler);
7755 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007756 Py_XDECREF(v);
7757 return NULL;
7758}
7759
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007760/* Charmap encoding: the lookup table */
7761
Alexander Belopolsky40018472011-02-26 01:02:56 +00007762struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007763 PyObject_HEAD
7764 unsigned char level1[32];
7765 int count2, count3;
7766 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007767};
7768
7769static PyObject*
7770encoding_map_size(PyObject *obj, PyObject* args)
7771{
7772 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007773 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007775}
7776
7777static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007778 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 PyDoc_STR("Return the size (in bytes) of this object") },
7780 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007781};
7782
7783static void
7784encoding_map_dealloc(PyObject* o)
7785{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007786 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007787}
7788
7789static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007790 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 "EncodingMap", /*tp_name*/
7792 sizeof(struct encoding_map), /*tp_basicsize*/
7793 0, /*tp_itemsize*/
7794 /* methods */
7795 encoding_map_dealloc, /*tp_dealloc*/
7796 0, /*tp_print*/
7797 0, /*tp_getattr*/
7798 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007799 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 0, /*tp_repr*/
7801 0, /*tp_as_number*/
7802 0, /*tp_as_sequence*/
7803 0, /*tp_as_mapping*/
7804 0, /*tp_hash*/
7805 0, /*tp_call*/
7806 0, /*tp_str*/
7807 0, /*tp_getattro*/
7808 0, /*tp_setattro*/
7809 0, /*tp_as_buffer*/
7810 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7811 0, /*tp_doc*/
7812 0, /*tp_traverse*/
7813 0, /*tp_clear*/
7814 0, /*tp_richcompare*/
7815 0, /*tp_weaklistoffset*/
7816 0, /*tp_iter*/
7817 0, /*tp_iternext*/
7818 encoding_map_methods, /*tp_methods*/
7819 0, /*tp_members*/
7820 0, /*tp_getset*/
7821 0, /*tp_base*/
7822 0, /*tp_dict*/
7823 0, /*tp_descr_get*/
7824 0, /*tp_descr_set*/
7825 0, /*tp_dictoffset*/
7826 0, /*tp_init*/
7827 0, /*tp_alloc*/
7828 0, /*tp_new*/
7829 0, /*tp_free*/
7830 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007831};
7832
7833PyObject*
7834PyUnicode_BuildEncodingMap(PyObject* string)
7835{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007836 PyObject *result;
7837 struct encoding_map *mresult;
7838 int i;
7839 int need_dict = 0;
7840 unsigned char level1[32];
7841 unsigned char level2[512];
7842 unsigned char *mlevel1, *mlevel2, *mlevel3;
7843 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007844 int kind;
7845 void *data;
7846 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007848 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007849 PyErr_BadArgument();
7850 return NULL;
7851 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007852 kind = PyUnicode_KIND(string);
7853 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007854 memset(level1, 0xFF, sizeof level1);
7855 memset(level2, 0xFF, sizeof level2);
7856
7857 /* If there isn't a one-to-one mapping of NULL to \0,
7858 or if there are non-BMP characters, we need to use
7859 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007860 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007861 need_dict = 1;
7862 for (i = 1; i < 256; i++) {
7863 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007864 ch = PyUnicode_READ(kind, data, i);
7865 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007866 need_dict = 1;
7867 break;
7868 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007869 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007870 /* unmapped character */
7871 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007872 l1 = ch >> 11;
7873 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007874 if (level1[l1] == 0xFF)
7875 level1[l1] = count2++;
7876 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007877 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007878 }
7879
7880 if (count2 >= 0xFF || count3 >= 0xFF)
7881 need_dict = 1;
7882
7883 if (need_dict) {
7884 PyObject *result = PyDict_New();
7885 PyObject *key, *value;
7886 if (!result)
7887 return NULL;
7888 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007889 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007890 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007891 if (!key || !value)
7892 goto failed1;
7893 if (PyDict_SetItem(result, key, value) == -1)
7894 goto failed1;
7895 Py_DECREF(key);
7896 Py_DECREF(value);
7897 }
7898 return result;
7899 failed1:
7900 Py_XDECREF(key);
7901 Py_XDECREF(value);
7902 Py_DECREF(result);
7903 return NULL;
7904 }
7905
7906 /* Create a three-level trie */
7907 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7908 16*count2 + 128*count3 - 1);
7909 if (!result)
7910 return PyErr_NoMemory();
7911 PyObject_Init(result, &EncodingMapType);
7912 mresult = (struct encoding_map*)result;
7913 mresult->count2 = count2;
7914 mresult->count3 = count3;
7915 mlevel1 = mresult->level1;
7916 mlevel2 = mresult->level23;
7917 mlevel3 = mresult->level23 + 16*count2;
7918 memcpy(mlevel1, level1, 32);
7919 memset(mlevel2, 0xFF, 16*count2);
7920 memset(mlevel3, 0, 128*count3);
7921 count3 = 0;
7922 for (i = 1; i < 256; i++) {
7923 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007924 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007925 /* unmapped character */
7926 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007927 o1 = PyUnicode_READ(kind, data, i)>>11;
7928 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007929 i2 = 16*mlevel1[o1] + o2;
7930 if (mlevel2[i2] == 0xFF)
7931 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007932 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007933 i3 = 128*mlevel2[i2] + o3;
7934 mlevel3[i3] = i;
7935 }
7936 return result;
7937}
7938
7939static int
Victor Stinner22168992011-11-20 17:09:18 +01007940encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007941{
7942 struct encoding_map *map = (struct encoding_map*)mapping;
7943 int l1 = c>>11;
7944 int l2 = (c>>7) & 0xF;
7945 int l3 = c & 0x7F;
7946 int i;
7947
Victor Stinner22168992011-11-20 17:09:18 +01007948 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007949 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007950 if (c == 0)
7951 return 0;
7952 /* level 1*/
7953 i = map->level1[l1];
7954 if (i == 0xFF) {
7955 return -1;
7956 }
7957 /* level 2*/
7958 i = map->level23[16*i+l2];
7959 if (i == 0xFF) {
7960 return -1;
7961 }
7962 /* level 3 */
7963 i = map->level23[16*map->count2 + 128*i + l3];
7964 if (i == 0) {
7965 return -1;
7966 }
7967 return i;
7968}
7969
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007970/* Lookup the character ch in the mapping. If the character
7971 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007972 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007973static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007974charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975{
Christian Heimes217cfd12007-12-02 14:31:20 +00007976 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007977 PyObject *x;
7978
7979 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007981 x = PyObject_GetItem(mapping, w);
7982 Py_DECREF(w);
7983 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7985 /* No mapping found means: mapping is undefined. */
7986 PyErr_Clear();
7987 x = Py_None;
7988 Py_INCREF(x);
7989 return x;
7990 } else
7991 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007992 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007993 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007994 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007995 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 long value = PyLong_AS_LONG(x);
7997 if (value < 0 || value > 255) {
7998 PyErr_SetString(PyExc_TypeError,
7999 "character mapping must be in range(256)");
8000 Py_DECREF(x);
8001 return NULL;
8002 }
8003 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008004 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008005 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008006 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008007 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008008 /* wrong return value */
8009 PyErr_Format(PyExc_TypeError,
8010 "character mapping must return integer, bytes or None, not %.400s",
8011 x->ob_type->tp_name);
8012 Py_DECREF(x);
8013 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 }
8015}
8016
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008017static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008018charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008019{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008020 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8021 /* exponentially overallocate to minimize reallocations */
8022 if (requiredsize < 2*outsize)
8023 requiredsize = 2*outsize;
8024 if (_PyBytes_Resize(outobj, requiredsize))
8025 return -1;
8026 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008027}
8028
/* Outcome of trying to encode one character with charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008032/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008033 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008034 space is available. Return a new reference to the object that
8035 was put in the output buffer, or Py_None, if the mapping was undefined
8036 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008037 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008038static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008039charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008040 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008041{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008042 PyObject *rep;
8043 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008044 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008045
Christian Heimes90aa7642007-12-19 02:45:37 +00008046 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008047 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008048 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008049 if (res == -1)
8050 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 if (outsize<requiredsize)
8052 if (charmapencode_resize(outobj, outpos, requiredsize))
8053 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008054 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 outstart[(*outpos)++] = (char)res;
8056 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008057 }
8058
8059 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008060 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008061 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008062 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 Py_DECREF(rep);
8064 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008065 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 if (PyLong_Check(rep)) {
8067 Py_ssize_t requiredsize = *outpos+1;
8068 if (outsize<requiredsize)
8069 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8070 Py_DECREF(rep);
8071 return enc_EXCEPTION;
8072 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008073 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008075 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008076 else {
8077 const char *repchars = PyBytes_AS_STRING(rep);
8078 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8079 Py_ssize_t requiredsize = *outpos+repsize;
8080 if (outsize<requiredsize)
8081 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8082 Py_DECREF(rep);
8083 return enc_EXCEPTION;
8084 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008085 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 memcpy(outstart + *outpos, repchars, repsize);
8087 *outpos += repsize;
8088 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008089 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008090 Py_DECREF(rep);
8091 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008092}
8093
8094/* handle an error in PyUnicode_EncodeCharmap
8095 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008096static int
8097charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008098 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008099 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008100 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008101 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008102{
8103 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008104 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008105 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008106 enum PyUnicode_Kind kind;
8107 void *data;
8108 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008109 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008110 Py_ssize_t collstartpos = *inpos;
8111 Py_ssize_t collendpos = *inpos+1;
8112 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008113 char *encoding = "charmap";
8114 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008115 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008116 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008117 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008118
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008119 if (PyUnicode_READY(unicode) < 0)
8120 return -1;
8121 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008122 /* find all unencodable characters */
8123 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008124 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008125 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008126 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008127 val = encoding_map_lookup(ch, mapping);
8128 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 break;
8130 ++collendpos;
8131 continue;
8132 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008133
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008134 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8135 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008136 if (rep==NULL)
8137 return -1;
8138 else if (rep!=Py_None) {
8139 Py_DECREF(rep);
8140 break;
8141 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008142 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008143 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008144 }
8145 /* cache callback name lookup
8146 * (if not done yet, i.e. it's the first error) */
8147 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008148 if ((errors==NULL) || (!strcmp(errors, "strict")))
8149 *known_errorHandler = 1;
8150 else if (!strcmp(errors, "replace"))
8151 *known_errorHandler = 2;
8152 else if (!strcmp(errors, "ignore"))
8153 *known_errorHandler = 3;
8154 else if (!strcmp(errors, "xmlcharrefreplace"))
8155 *known_errorHandler = 4;
8156 else
8157 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008158 }
8159 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008160 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008161 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008162 return -1;
8163 case 2: /* replace */
8164 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 x = charmapencode_output('?', mapping, res, respos);
8166 if (x==enc_EXCEPTION) {
8167 return -1;
8168 }
8169 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008170 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 return -1;
8172 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008173 }
8174 /* fall through */
8175 case 3: /* ignore */
8176 *inpos = collendpos;
8177 break;
8178 case 4: /* xmlcharrefreplace */
8179 /* generate replacement (temporarily (mis)uses p) */
8180 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008181 char buffer[2+29+1+1];
8182 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008183 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 for (cp = buffer; *cp; ++cp) {
8185 x = charmapencode_output(*cp, mapping, res, respos);
8186 if (x==enc_EXCEPTION)
8187 return -1;
8188 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008189 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 return -1;
8191 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008192 }
8193 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008194 *inpos = collendpos;
8195 break;
8196 default:
8197 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008198 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008199 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008200 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008202 if (PyBytes_Check(repunicode)) {
8203 /* Directly copy bytes result to output. */
8204 Py_ssize_t outsize = PyBytes_Size(*res);
8205 Py_ssize_t requiredsize;
8206 repsize = PyBytes_Size(repunicode);
8207 requiredsize = *respos + repsize;
8208 if (requiredsize > outsize)
8209 /* Make room for all additional bytes. */
8210 if (charmapencode_resize(res, respos, requiredsize)) {
8211 Py_DECREF(repunicode);
8212 return -1;
8213 }
8214 memcpy(PyBytes_AsString(*res) + *respos,
8215 PyBytes_AsString(repunicode), repsize);
8216 *respos += repsize;
8217 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008218 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008219 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008220 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008221 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008222 if (PyUnicode_READY(repunicode) < 0) {
8223 Py_DECREF(repunicode);
8224 return -1;
8225 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008226 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008227 data = PyUnicode_DATA(repunicode);
8228 kind = PyUnicode_KIND(repunicode);
8229 for (index = 0; index < repsize; index++) {
8230 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8231 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008233 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008234 return -1;
8235 }
8236 else if (x==enc_FAILED) {
8237 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008238 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 return -1;
8240 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008241 }
8242 *inpos = newpos;
8243 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008244 }
8245 return 0;
8246}
8247
Alexander Belopolsky40018472011-02-26 01:02:56 +00008248PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008249_PyUnicode_EncodeCharmap(PyObject *unicode,
8250 PyObject *mapping,
8251 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008253 /* output object */
8254 PyObject *res = NULL;
8255 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008256 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008257 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008259 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 PyObject *errorHandler = NULL;
8261 PyObject *exc = NULL;
8262 /* the following variable is used for caching string comparisons
8263 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8264 * 3=ignore, 4=xmlcharrefreplace */
8265 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008266
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008267 if (PyUnicode_READY(unicode) < 0)
8268 return NULL;
8269 size = PyUnicode_GET_LENGTH(unicode);
8270
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271 /* Default to Latin-1 */
8272 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008273 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008274
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008275 /* allocate enough for a simple encoding without
8276 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008277 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278 if (res == NULL)
8279 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008280 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008281 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008283 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008284 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008285 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008286 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 if (x==enc_EXCEPTION) /* error */
8288 goto onError;
8289 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008290 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 &exc,
8292 &known_errorHandler, &errorHandler, errors,
8293 &res, &respos)) {
8294 goto onError;
8295 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008296 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 else
8298 /* done with this character => adjust input position */
8299 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008303 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008304 if (_PyBytes_Resize(&res, respos) < 0)
8305 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008306
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008307 Py_XDECREF(exc);
8308 Py_XDECREF(errorHandler);
8309 return res;
8310
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008312 Py_XDECREF(res);
8313 Py_XDECREF(exc);
8314 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315 return NULL;
8316}
8317
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008318/* Deprecated */
8319PyObject *
8320PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8321 Py_ssize_t size,
8322 PyObject *mapping,
8323 const char *errors)
8324{
8325 PyObject *result;
8326 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8327 if (unicode == NULL)
8328 return NULL;
8329 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8330 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008331 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008332}
8333
Alexander Belopolsky40018472011-02-26 01:02:56 +00008334PyObject *
8335PyUnicode_AsCharmapString(PyObject *unicode,
8336 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337{
8338 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 PyErr_BadArgument();
8340 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008342 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343}
8344
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008345/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008346static void
8347make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008348 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008349 Py_ssize_t startpos, Py_ssize_t endpos,
8350 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353 *exceptionObject = _PyUnicodeTranslateError_Create(
8354 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008355 }
8356 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8358 goto onError;
8359 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8360 goto onError;
8361 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8362 goto onError;
8363 return;
8364 onError:
8365 Py_DECREF(*exceptionObject);
8366 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367 }
8368}
8369
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008371static void
8372raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008373 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008374 Py_ssize_t startpos, Py_ssize_t endpos,
8375 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376{
8377 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008378 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381}
8382
8383/* error handling callback helper:
8384 build arguments, call the callback and check the arguments,
8385 put the result into newpos and return the replacement string, which
8386 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008387static PyObject *
8388unicode_translate_call_errorhandler(const char *errors,
8389 PyObject **errorHandler,
8390 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008392 Py_ssize_t startpos, Py_ssize_t endpos,
8393 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008395 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008397 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008398 PyObject *restuple;
8399 PyObject *resunicode;
8400
8401 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008405 }
8406
8407 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008408 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008409 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411
8412 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008414 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008416 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008417 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 Py_DECREF(restuple);
8419 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008420 }
8421 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 &resunicode, &i_newpos)) {
8423 Py_DECREF(restuple);
8424 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008425 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008426 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008427 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008428 else
8429 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008430 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8432 Py_DECREF(restuple);
8433 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008434 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008435 Py_INCREF(resunicode);
8436 Py_DECREF(restuple);
8437 return resunicode;
8438}
8439
8440/* Lookup the character ch in the mapping and put the result in result,
8441 which must be decrefed by the caller.
8442 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008443static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008444charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008445{
Christian Heimes217cfd12007-12-02 14:31:20 +00008446 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008447 PyObject *x;
8448
8449 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008451 x = PyObject_GetItem(mapping, w);
8452 Py_DECREF(w);
8453 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8455 /* No mapping found means: use 1:1 mapping. */
8456 PyErr_Clear();
8457 *result = NULL;
8458 return 0;
8459 } else
8460 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008461 }
8462 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 *result = x;
8464 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008465 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008466 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 long value = PyLong_AS_LONG(x);
8468 long max = PyUnicode_GetMax();
8469 if (value < 0 || value > max) {
8470 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008471 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 Py_DECREF(x);
8473 return -1;
8474 }
8475 *result = x;
8476 return 0;
8477 }
8478 else if (PyUnicode_Check(x)) {
8479 *result = x;
8480 return 0;
8481 }
8482 else {
8483 /* wrong return value */
8484 PyErr_SetString(PyExc_TypeError,
8485 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008486 Py_DECREF(x);
8487 return -1;
8488 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489}
8490/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 if not reallocate and adjust various state variables.
8492 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008493static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008494charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008496{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008497 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008498 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 /* exponentially overallocate to minimize reallocations */
8500 if (requiredsize < 2 * oldsize)
8501 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8503 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008506 }
8507 return 0;
8508}
8509/* lookup the character, put the result in the output string and adjust
8510 various state variables. Return a new reference to the object that
8511 was put in the output buffer in *result, or Py_None, if the mapping was
8512 undefined (in which case no character was written).
8513 The caller must decref result.
8514 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008515static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008516charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8517 PyObject *mapping, Py_UCS4 **output,
8518 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008519 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8522 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008524 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008525 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008527 }
8528 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008530 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008531 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008533 }
8534 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008535 Py_ssize_t repsize;
8536 if (PyUnicode_READY(*res) == -1)
8537 return -1;
8538 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008539 if (repsize==1) {
8540 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 }
8543 else if (repsize!=0) {
8544 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008545 Py_ssize_t requiredsize = *opos +
8546 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008548 Py_ssize_t i;
8549 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551 for(i = 0; i < repsize; i++)
8552 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008554 }
8555 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557 return 0;
8558}
8559
Alexander Belopolsky40018472011-02-26 01:02:56 +00008560PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008561_PyUnicode_TranslateCharmap(PyObject *input,
8562 PyObject *mapping,
8563 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008564{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008565 /* input object */
8566 char *idata;
8567 Py_ssize_t size, i;
8568 int kind;
8569 /* output buffer */
8570 Py_UCS4 *output = NULL;
8571 Py_ssize_t osize;
8572 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008573 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008574 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008575 char *reason = "character maps to <undefined>";
8576 PyObject *errorHandler = NULL;
8577 PyObject *exc = NULL;
8578 /* the following variable is used for caching string comparisons
8579 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8580 * 3=ignore, 4=xmlcharrefreplace */
8581 int known_errorHandler = -1;
8582
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008584 PyErr_BadArgument();
8585 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008588 if (PyUnicode_READY(input) == -1)
8589 return NULL;
8590 idata = (char*)PyUnicode_DATA(input);
8591 kind = PyUnicode_KIND(input);
8592 size = PyUnicode_GET_LENGTH(input);
8593 i = 0;
8594
8595 if (size == 0) {
8596 Py_INCREF(input);
8597 return input;
8598 }
8599
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008600 /* allocate enough for a simple 1:1 translation without
8601 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008602 osize = size;
8603 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8604 opos = 0;
8605 if (output == NULL) {
8606 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008610 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 /* try to encode it */
8612 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 if (charmaptranslate_output(input, i, mapping,
8614 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 Py_XDECREF(x);
8616 goto onError;
8617 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008618 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008620 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 else { /* untranslatable character */
8622 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8623 Py_ssize_t repsize;
8624 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008625 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 Py_ssize_t collstart = i;
8628 Py_ssize_t collend = i+1;
8629 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008630
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 while (collend < size) {
8633 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 goto onError;
8635 Py_XDECREF(x);
8636 if (x!=Py_None)
8637 break;
8638 ++collend;
8639 }
8640 /* cache callback name lookup
8641 * (if not done yet, i.e. it's the first error) */
8642 if (known_errorHandler==-1) {
8643 if ((errors==NULL) || (!strcmp(errors, "strict")))
8644 known_errorHandler = 1;
8645 else if (!strcmp(errors, "replace"))
8646 known_errorHandler = 2;
8647 else if (!strcmp(errors, "ignore"))
8648 known_errorHandler = 3;
8649 else if (!strcmp(errors, "xmlcharrefreplace"))
8650 known_errorHandler = 4;
8651 else
8652 known_errorHandler = 0;
8653 }
8654 switch (known_errorHandler) {
8655 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008656 raise_translate_exception(&exc, input, collstart,
8657 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008658 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 case 2: /* replace */
8660 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008661 for (coll = collstart; coll<collend; coll++)
8662 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 /* fall through */
8664 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 break;
8667 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008668 /* generate replacement (temporarily (mis)uses i) */
8669 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 char buffer[2+29+1+1];
8671 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008672 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8673 if (charmaptranslate_makespace(&output, &osize,
8674 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 goto onError;
8676 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 break;
8681 default:
8682 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008683 reason, input, &exc,
8684 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008685 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008687 if (PyUnicode_READY(repunicode) < 0) {
8688 Py_DECREF(repunicode);
8689 goto onError;
8690 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692 repsize = PyUnicode_GET_LENGTH(repunicode);
8693 if (charmaptranslate_makespace(&output, &osize,
8694 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 Py_DECREF(repunicode);
8696 goto onError;
8697 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698 for (uni2 = 0; repsize-->0; ++uni2)
8699 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8700 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008702 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008703 }
8704 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008705 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8706 if (!res)
8707 goto onError;
8708 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008709 Py_XDECREF(exc);
8710 Py_XDECREF(errorHandler);
8711 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008715 Py_XDECREF(exc);
8716 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008717 return NULL;
8718}
8719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008720/* Deprecated. Use PyUnicode_Translate instead. */
8721PyObject *
8722PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8723 Py_ssize_t size,
8724 PyObject *mapping,
8725 const char *errors)
8726{
8727 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8728 if (!unicode)
8729 return NULL;
8730 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8731}
8732
Alexander Belopolsky40018472011-02-26 01:02:56 +00008733PyObject *
8734PyUnicode_Translate(PyObject *str,
8735 PyObject *mapping,
8736 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737{
8738 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008739
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740 str = PyUnicode_FromObject(str);
8741 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008742 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008743 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744 Py_DECREF(str);
8745 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008746
Benjamin Peterson29060642009-01-31 22:14:21 +00008747 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748 Py_XDECREF(str);
8749 return NULL;
8750}
Tim Petersced69f82003-09-16 20:30:58 +00008751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008752static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008753fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008754{
8755 /* No need to call PyUnicode_READY(self) because this function is only
8756 called as a callback from fixup() which does it already. */
8757 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8758 const int kind = PyUnicode_KIND(self);
8759 void *data = PyUnicode_DATA(self);
8760 Py_UCS4 maxchar = 0, ch, fixed;
8761 Py_ssize_t i;
8762
8763 for (i = 0; i < len; ++i) {
8764 ch = PyUnicode_READ(kind, data, i);
8765 fixed = 0;
8766 if (ch > 127) {
8767 if (Py_UNICODE_ISSPACE(ch))
8768 fixed = ' ';
8769 else {
8770 const int decimal = Py_UNICODE_TODECIMAL(ch);
8771 if (decimal >= 0)
8772 fixed = '0' + decimal;
8773 }
8774 if (fixed != 0) {
8775 if (fixed > maxchar)
8776 maxchar = fixed;
8777 PyUnicode_WRITE(kind, data, i, fixed);
8778 }
8779 else if (ch > maxchar)
8780 maxchar = ch;
8781 }
8782 else if (ch > maxchar)
8783 maxchar = ch;
8784 }
8785
8786 return maxchar;
8787}
8788
8789PyObject *
8790_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8791{
8792 if (!PyUnicode_Check(unicode)) {
8793 PyErr_BadInternalCall();
8794 return NULL;
8795 }
8796 if (PyUnicode_READY(unicode) == -1)
8797 return NULL;
8798 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8799 /* If the string is already ASCII, just return the same string */
8800 Py_INCREF(unicode);
8801 return unicode;
8802 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008803 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804}
8805
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008806PyObject *
8807PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8808 Py_ssize_t length)
8809{
Victor Stinnerf0124502011-11-21 23:12:56 +01008810 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008811 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008812 Py_UCS4 maxchar;
8813 enum PyUnicode_Kind kind;
8814 void *data;
8815
8816 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008817 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008818 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008819 if (ch > 127) {
8820 int decimal = Py_UNICODE_TODECIMAL(ch);
8821 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008822 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008823 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008824 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008825 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008826
8827 /* Copy to a new string */
8828 decimal = PyUnicode_New(length, maxchar);
8829 if (decimal == NULL)
8830 return decimal;
8831 kind = PyUnicode_KIND(decimal);
8832 data = PyUnicode_DATA(decimal);
8833 /* Iterate over code points */
8834 for (i = 0; i < length; i++) {
8835 Py_UNICODE ch = s[i];
8836 if (ch > 127) {
8837 int decimal = Py_UNICODE_TODECIMAL(ch);
8838 if (decimal >= 0)
8839 ch = '0' + decimal;
8840 }
8841 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008842 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008843 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008844}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008845/* --- Decimal Encoder ---------------------------------------------------- */
8846
Alexander Belopolsky40018472011-02-26 01:02:56 +00008847int
8848PyUnicode_EncodeDecimal(Py_UNICODE *s,
8849 Py_ssize_t length,
8850 char *output,
8851 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008852{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008853 PyObject *errorHandler = NULL;
8854 PyObject *exc = NULL;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008855 PyObject *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008856 const char *encoding = "decimal";
8857 const char *reason = "invalid decimal Unicode string";
8858 /* the following variable is used for caching string comparisons
8859 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8860 int known_errorHandler = -1;
Victor Stinner42bf7752011-11-21 22:52:58 +01008861 Py_ssize_t i, j;
8862 enum PyUnicode_Kind kind;
8863 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008864
8865 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008866 PyErr_BadArgument();
8867 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008868 }
8869
Victor Stinner42bf7752011-11-21 22:52:58 +01008870 unicode = PyUnicode_FromUnicode(s, length);
8871 if (unicode == NULL)
8872 return -1;
8873
8874 if (PyUnicode_READY(unicode) < 0)
8875 goto onError;
8876 kind = PyUnicode_KIND(unicode);
8877 data = PyUnicode_DATA(unicode);
8878
Victor Stinnerb84d7232011-11-22 01:50:07 +01008879 for (i=0; i < length; ) {
Victor Stinner42bf7752011-11-21 22:52:58 +01008880 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 int decimal;
Victor Stinner42bf7752011-11-21 22:52:58 +01008882 Py_ssize_t startpos, endpos;
Tim Petersced69f82003-09-16 20:30:58 +00008883
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008885 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008886 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008887 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008888 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008889 decimal = Py_UNICODE_TODECIMAL(ch);
8890 if (decimal >= 0) {
8891 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008892 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008893 continue;
8894 }
8895 if (0 < ch && ch < 256) {
8896 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008897 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008898 continue;
8899 }
8900 /* All other characters are considered unencodable */
Victor Stinner42bf7752011-11-21 22:52:58 +01008901 startpos = i;
8902 endpos = i+1;
8903 for (; endpos < length; endpos++) {
8904 ch = PyUnicode_READ(kind, data, endpos);
8905 if ((0 < ch && ch < 256) ||
Victor Stinnerb84d7232011-11-22 01:50:07 +01008906 Py_UNICODE_ISSPACE(ch) ||
8907 0 <= Py_UNICODE_TODECIMAL(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +00008908 break;
8909 }
8910 /* cache callback name lookup
8911 * (if not done yet, i.e. it's the first error) */
8912 if (known_errorHandler==-1) {
8913 if ((errors==NULL) || (!strcmp(errors, "strict")))
8914 known_errorHandler = 1;
8915 else if (!strcmp(errors, "replace"))
8916 known_errorHandler = 2;
8917 else if (!strcmp(errors, "ignore"))
8918 known_errorHandler = 3;
8919 else if (!strcmp(errors, "xmlcharrefreplace"))
8920 known_errorHandler = 4;
8921 else
8922 known_errorHandler = 0;
8923 }
8924 switch (known_errorHandler) {
8925 case 1: /* strict */
Victor Stinner42bf7752011-11-21 22:52:58 +01008926 raise_encode_exception(&exc, encoding, unicode, startpos, endpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008927 goto onError;
8928 case 2: /* replace */
Victor Stinner42bf7752011-11-21 22:52:58 +01008929 for (j=startpos; j < endpos; j++)
Benjamin Peterson29060642009-01-31 22:14:21 +00008930 *output++ = '?';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008931 i = endpos;
8932 break;
Benjamin Peterson29060642009-01-31 22:14:21 +00008933 case 3: /* ignore */
Victor Stinner42bf7752011-11-21 22:52:58 +01008934 i = endpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 break;
8936 case 4: /* xmlcharrefreplace */
Victor Stinner42bf7752011-11-21 22:52:58 +01008937 /* generate replacement */
8938 for (j=startpos; j < endpos; j++) {
8939 ch = PyUnicode_READ(kind, data, i);
8940 output += sprintf(output, "&#%d;", (int)ch);
8941 i++;
8942 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008943 break;
8944 default:
Victor Stinner42bf7752011-11-21 22:52:58 +01008945 {
8946 PyObject *repunicode;
8947 Py_ssize_t repsize, newpos, k;
8948 enum PyUnicode_Kind repkind;
8949 void *repdata;
8950
Benjamin Peterson29060642009-01-31 22:14:21 +00008951 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008952 encoding, reason, unicode, &exc,
Victor Stinner42bf7752011-11-21 22:52:58 +01008953 startpos, endpos, &newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 if (repunicode == NULL)
8955 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008956 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008957 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008958 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8959 Py_DECREF(repunicode);
8960 goto onError;
8961 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008962 if (PyUnicode_READY(repunicode) < 0) {
8963 Py_DECREF(repunicode);
8964 goto onError;
8965 }
8966 repkind = PyUnicode_KIND(repunicode);
8967 repdata = PyUnicode_DATA(repunicode);
8968
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 /* generate replacement */
8970 repsize = PyUnicode_GET_SIZE(repunicode);
Victor Stinner42bf7752011-11-21 22:52:58 +01008971 for (k=0; k<repsize; k++) {
8972 ch = PyUnicode_READ(repkind, repdata, k);
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 if (Py_UNICODE_ISSPACE(ch))
8974 *output++ = ' ';
8975 else {
8976 decimal = Py_UNICODE_TODECIMAL(ch);
8977 if (decimal >= 0)
8978 *output++ = '0' + decimal;
8979 else if (0 < ch && ch < 256)
8980 *output++ = (char)ch;
8981 else {
8982 Py_DECREF(repunicode);
8983 raise_encode_exception(&exc, encoding,
Victor Stinner42bf7752011-11-21 22:52:58 +01008984 unicode, startpos, endpos,
8985 reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008986 goto onError;
8987 }
8988 }
8989 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008990 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008991 Py_DECREF(repunicode);
8992 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008993 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008994 }
8995 /* 0-terminate the output string */
8996 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008997 Py_XDECREF(exc);
8998 Py_XDECREF(errorHandler);
Victor Stinner42bf7752011-11-21 22:52:58 +01008999 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009000 return 0;
9001
Benjamin Peterson29060642009-01-31 22:14:21 +00009002 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009003 Py_XDECREF(exc);
9004 Py_XDECREF(errorHandler);
Victor Stinner42bf7752011-11-21 22:52:58 +01009005 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009006 return -1;
9007}
9008
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009/* --- Helpers ------------------------------------------------------------ */
9010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009012any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013 Py_ssize_t start,
9014 Py_ssize_t end)
9015{
9016 int kind1, kind2, kind;
9017 void *buf1, *buf2;
9018 Py_ssize_t len1, len2, result;
9019
9020 kind1 = PyUnicode_KIND(s1);
9021 kind2 = PyUnicode_KIND(s2);
9022 kind = kind1 > kind2 ? kind1 : kind2;
9023 buf1 = PyUnicode_DATA(s1);
9024 buf2 = PyUnicode_DATA(s2);
9025 if (kind1 != kind)
9026 buf1 = _PyUnicode_AsKind(s1, kind);
9027 if (!buf1)
9028 return -2;
9029 if (kind2 != kind)
9030 buf2 = _PyUnicode_AsKind(s2, kind);
9031 if (!buf2) {
9032 if (kind1 != kind) PyMem_Free(buf1);
9033 return -2;
9034 }
9035 len1 = PyUnicode_GET_LENGTH(s1);
9036 len2 = PyUnicode_GET_LENGTH(s2);
9037
Victor Stinner794d5672011-10-10 03:21:36 +02009038 if (direction > 0) {
9039 switch(kind) {
9040 case PyUnicode_1BYTE_KIND:
9041 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9042 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9043 else
9044 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9045 break;
9046 case PyUnicode_2BYTE_KIND:
9047 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9048 break;
9049 case PyUnicode_4BYTE_KIND:
9050 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9051 break;
9052 default:
9053 assert(0); result = -2;
9054 }
9055 }
9056 else {
9057 switch(kind) {
9058 case PyUnicode_1BYTE_KIND:
9059 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9060 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9061 else
9062 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9063 break;
9064 case PyUnicode_2BYTE_KIND:
9065 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9066 break;
9067 case PyUnicode_4BYTE_KIND:
9068 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9069 break;
9070 default:
9071 assert(0); result = -2;
9072 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009073 }
9074
9075 if (kind1 != kind)
9076 PyMem_Free(buf1);
9077 if (kind2 != kind)
9078 PyMem_Free(buf2);
9079
9080 return result;
9081}
9082
9083Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009084_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 Py_ssize_t n_buffer,
9086 void *digits, Py_ssize_t n_digits,
9087 Py_ssize_t min_width,
9088 const char *grouping,
9089 const char *thousands_sep)
9090{
9091 switch(kind) {
9092 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009093 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9094 return _PyUnicode_ascii_InsertThousandsGrouping(
9095 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9096 min_width, grouping, thousands_sep);
9097 else
9098 return _PyUnicode_ucs1_InsertThousandsGrouping(
9099 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9100 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101 case PyUnicode_2BYTE_KIND:
9102 return _PyUnicode_ucs2_InsertThousandsGrouping(
9103 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9104 min_width, grouping, thousands_sep);
9105 case PyUnicode_4BYTE_KIND:
9106 return _PyUnicode_ucs4_InsertThousandsGrouping(
9107 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9108 min_width, grouping, thousands_sep);
9109 }
9110 assert(0);
9111 return -1;
9112}
9113
9114
/* Helper macro to fix up start/end slice values in place for a sequence of
   length len:
     - end is clipped to len; a negative end counts from the back and is
       clipped to 0,
     - a negative start counts from the back and is clipped to 0.
   start is NOT clipped to len and start may end up greater than end; users
   have to cope with an empty or inverted range.

   start and end must be lvalues.  Every argument is evaluated more than
   once.  The do { } while (0) wrapper makes the expansion a single
   statement, so it is safe in an unbraced if/else; use it as a statement
   followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009129
Alexander Belopolsky40018472011-02-26 01:02:56 +00009130Py_ssize_t
9131PyUnicode_Count(PyObject *str,
9132 PyObject *substr,
9133 Py_ssize_t start,
9134 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009135{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009136 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009137 PyObject* str_obj;
9138 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139 int kind1, kind2, kind;
9140 void *buf1 = NULL, *buf2 = NULL;
9141 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009142
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009143 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009144 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009145 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009146 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009147 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009148 Py_DECREF(str_obj);
9149 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150 }
Tim Petersced69f82003-09-16 20:30:58 +00009151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152 kind1 = PyUnicode_KIND(str_obj);
9153 kind2 = PyUnicode_KIND(sub_obj);
9154 kind = kind1 > kind2 ? kind1 : kind2;
9155 buf1 = PyUnicode_DATA(str_obj);
9156 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009157 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158 if (!buf1)
9159 goto onError;
9160 buf2 = PyUnicode_DATA(sub_obj);
9161 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009162 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009163 if (!buf2)
9164 goto onError;
9165 len1 = PyUnicode_GET_LENGTH(str_obj);
9166 len2 = PyUnicode_GET_LENGTH(sub_obj);
9167
9168 ADJUST_INDICES(start, end, len1);
9169 switch(kind) {
9170 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009171 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9172 result = asciilib_count(
9173 ((Py_UCS1*)buf1) + start, end - start,
9174 buf2, len2, PY_SSIZE_T_MAX
9175 );
9176 else
9177 result = ucs1lib_count(
9178 ((Py_UCS1*)buf1) + start, end - start,
9179 buf2, len2, PY_SSIZE_T_MAX
9180 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009181 break;
9182 case PyUnicode_2BYTE_KIND:
9183 result = ucs2lib_count(
9184 ((Py_UCS2*)buf1) + start, end - start,
9185 buf2, len2, PY_SSIZE_T_MAX
9186 );
9187 break;
9188 case PyUnicode_4BYTE_KIND:
9189 result = ucs4lib_count(
9190 ((Py_UCS4*)buf1) + start, end - start,
9191 buf2, len2, PY_SSIZE_T_MAX
9192 );
9193 break;
9194 default:
9195 assert(0); result = 0;
9196 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009197
9198 Py_DECREF(sub_obj);
9199 Py_DECREF(str_obj);
9200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009201 if (kind1 != kind)
9202 PyMem_Free(buf1);
9203 if (kind2 != kind)
9204 PyMem_Free(buf2);
9205
Guido van Rossumd57fd912000-03-10 22:53:23 +00009206 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009207 onError:
9208 Py_DECREF(sub_obj);
9209 Py_DECREF(str_obj);
9210 if (kind1 != kind && buf1)
9211 PyMem_Free(buf1);
9212 if (kind2 != kind && buf2)
9213 PyMem_Free(buf2);
9214 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215}
9216
Alexander Belopolsky40018472011-02-26 01:02:56 +00009217Py_ssize_t
9218PyUnicode_Find(PyObject *str,
9219 PyObject *sub,
9220 Py_ssize_t start,
9221 Py_ssize_t end,
9222 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009223{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009224 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009225
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009228 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009229 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009230 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009231 Py_DECREF(str);
9232 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009233 }
Tim Petersced69f82003-09-16 20:30:58 +00009234
Victor Stinner794d5672011-10-10 03:21:36 +02009235 result = any_find_slice(direction,
9236 str, sub, start, end
9237 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009238
Guido van Rossumd57fd912000-03-10 22:53:23 +00009239 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009240 Py_DECREF(sub);
9241
Guido van Rossumd57fd912000-03-10 22:53:23 +00009242 return result;
9243}
9244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245Py_ssize_t
9246PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9247 Py_ssize_t start, Py_ssize_t end,
9248 int direction)
9249{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009250 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009251 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009252 if (PyUnicode_READY(str) == -1)
9253 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009254 if (start < 0 || end < 0) {
9255 PyErr_SetString(PyExc_IndexError, "string index out of range");
9256 return -2;
9257 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009258 if (end > PyUnicode_GET_LENGTH(str))
9259 end = PyUnicode_GET_LENGTH(str);
9260 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009261 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9262 kind, end-start, ch, direction);
9263 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009264 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009265 else
9266 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267}
9268
Alexander Belopolsky40018472011-02-26 01:02:56 +00009269static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009270tailmatch(PyObject *self,
9271 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009272 Py_ssize_t start,
9273 Py_ssize_t end,
9274 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009275{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009276 int kind_self;
9277 int kind_sub;
9278 void *data_self;
9279 void *data_sub;
9280 Py_ssize_t offset;
9281 Py_ssize_t i;
9282 Py_ssize_t end_sub;
9283
9284 if (PyUnicode_READY(self) == -1 ||
9285 PyUnicode_READY(substring) == -1)
9286 return 0;
9287
9288 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289 return 1;
9290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9292 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009294 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009296 kind_self = PyUnicode_KIND(self);
9297 data_self = PyUnicode_DATA(self);
9298 kind_sub = PyUnicode_KIND(substring);
9299 data_sub = PyUnicode_DATA(substring);
9300 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9301
9302 if (direction > 0)
9303 offset = end;
9304 else
9305 offset = start;
9306
9307 if (PyUnicode_READ(kind_self, data_self, offset) ==
9308 PyUnicode_READ(kind_sub, data_sub, 0) &&
9309 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9310 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9311 /* If both are of the same kind, memcmp is sufficient */
9312 if (kind_self == kind_sub) {
9313 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009314 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315 data_sub,
9316 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009317 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 }
9319 /* otherwise we have to compare each character by first accesing it */
9320 else {
9321 /* We do not need to compare 0 and len(substring)-1 because
9322 the if statement above ensured already that they are equal
9323 when we end up here. */
9324 // TODO: honor direction and do a forward or backwards search
9325 for (i = 1; i < end_sub; ++i) {
9326 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9327 PyUnicode_READ(kind_sub, data_sub, i))
9328 return 0;
9329 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009330 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009332 }
9333
9334 return 0;
9335}
9336
Alexander Belopolsky40018472011-02-26 01:02:56 +00009337Py_ssize_t
9338PyUnicode_Tailmatch(PyObject *str,
9339 PyObject *substr,
9340 Py_ssize_t start,
9341 Py_ssize_t end,
9342 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009343{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009344 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009345
Guido van Rossumd57fd912000-03-10 22:53:23 +00009346 str = PyUnicode_FromObject(str);
9347 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009348 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349 substr = PyUnicode_FromObject(substr);
9350 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009351 Py_DECREF(str);
9352 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009353 }
Tim Petersced69f82003-09-16 20:30:58 +00009354
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009355 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009356 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009357 Py_DECREF(str);
9358 Py_DECREF(substr);
9359 return result;
9360}
9361
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362/* Apply fixfct filter to the Unicode object self and return a
9363 reference to the modified object */
9364
Alexander Belopolsky40018472011-02-26 01:02:56 +00009365static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009366fixup(PyObject *self,
9367 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009369 PyObject *u;
9370 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009371
Victor Stinner87af4f22011-11-21 23:03:47 +01009372 u = PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009373 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009374 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009375 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009377 /* fix functions return the new maximum character in a string,
9378 if the kind of the resulting unicode object does not change,
9379 everything is fine. Otherwise we need to change the string kind
9380 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009381 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382 if (maxchar_new == 0)
9383 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9384 else if (maxchar_new <= 127)
9385 maxchar_new = 127;
9386 else if (maxchar_new <= 255)
9387 maxchar_new = 255;
9388 else if (maxchar_new <= 65535)
9389 maxchar_new = 65535;
9390 else
9391 maxchar_new = 1114111; /* 0x10ffff */
9392
9393 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009394 /* fixfct should return TRUE if it modified the buffer. If
9395 FALSE, return a reference to the original buffer instead
9396 (to save space, not time) */
9397 Py_INCREF(self);
9398 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009399 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 else if (maxchar_new == maxchar_old) {
9402 return u;
9403 }
9404 else {
9405 /* In case the maximum character changed, we need to
9406 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009407 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408 if (v == NULL) {
9409 Py_DECREF(u);
9410 return NULL;
9411 }
9412 if (maxchar_new > maxchar_old) {
9413 /* If the maxchar increased so that the kind changed, not all
9414 characters are representable anymore and we need to fix the
9415 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009416 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009417 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9419 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009420 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009421 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009422 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423
9424 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009425 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009426 return v;
9427 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009428}
9429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009431fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009432{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 /* No need to call PyUnicode_READY(self) because this function is only
9434 called as a callback from fixup() which does it already. */
9435 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9436 const int kind = PyUnicode_KIND(self);
9437 void *data = PyUnicode_DATA(self);
9438 int touched = 0;
9439 Py_UCS4 maxchar = 0;
9440 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009442 for (i = 0; i < len; ++i) {
9443 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9444 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9445 if (up != ch) {
9446 if (up > maxchar)
9447 maxchar = up;
9448 PyUnicode_WRITE(kind, data, i, up);
9449 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009450 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009451 else if (ch > maxchar)
9452 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453 }
9454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 if (touched)
9456 return maxchar;
9457 else
9458 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009459}
9460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009461static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009462fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009463{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9465 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9466 const int kind = PyUnicode_KIND(self);
9467 void *data = PyUnicode_DATA(self);
9468 int touched = 0;
9469 Py_UCS4 maxchar = 0;
9470 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 for(i = 0; i < len; ++i) {
9473 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9474 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9475 if (lo != ch) {
9476 if (lo > maxchar)
9477 maxchar = lo;
9478 PyUnicode_WRITE(kind, data, i, lo);
9479 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009480 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481 else if (ch > maxchar)
9482 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483 }
9484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485 if (touched)
9486 return maxchar;
9487 else
9488 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009489}
9490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009492fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009493{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9495 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9496 const int kind = PyUnicode_KIND(self);
9497 void *data = PyUnicode_DATA(self);
9498 int touched = 0;
9499 Py_UCS4 maxchar = 0;
9500 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 for(i = 0; i < len; ++i) {
9503 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9504 Py_UCS4 nu = 0;
9505
9506 if (Py_UNICODE_ISUPPER(ch))
9507 nu = Py_UNICODE_TOLOWER(ch);
9508 else if (Py_UNICODE_ISLOWER(ch))
9509 nu = Py_UNICODE_TOUPPER(ch);
9510
9511 if (nu != 0) {
9512 if (nu > maxchar)
9513 maxchar = nu;
9514 PyUnicode_WRITE(kind, data, i, nu);
9515 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009517 else if (ch > maxchar)
9518 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519 }
9520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009521 if (touched)
9522 return maxchar;
9523 else
9524 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009525}
9526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009527static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009528fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9531 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9532 const int kind = PyUnicode_KIND(self);
9533 void *data = PyUnicode_DATA(self);
9534 int touched = 0;
9535 Py_UCS4 maxchar = 0;
9536 Py_ssize_t i = 0;
9537 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009538
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009539 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009540 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009541
9542 ch = PyUnicode_READ(kind, data, i);
9543 if (!Py_UNICODE_ISUPPER(ch)) {
9544 maxchar = Py_UNICODE_TOUPPER(ch);
9545 PyUnicode_WRITE(kind, data, i, maxchar);
9546 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009547 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009548 ++i;
9549 for(; i < len; ++i) {
9550 ch = PyUnicode_READ(kind, data, i);
9551 if (!Py_UNICODE_ISLOWER(ch)) {
9552 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9553 if (lo > maxchar)
9554 maxchar = lo;
9555 PyUnicode_WRITE(kind, data, i, lo);
9556 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 else if (ch > maxchar)
9559 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009560 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561
9562 if (touched)
9563 return maxchar;
9564 else
9565 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566}
9567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009569fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9572 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9573 const int kind = PyUnicode_KIND(self);
9574 void *data = PyUnicode_DATA(self);
9575 Py_UCS4 maxchar = 0;
9576 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009577 int previous_is_cased;
9578
9579 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009580 if (len == 1) {
9581 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9582 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9583 if (ti != ch) {
9584 PyUnicode_WRITE(kind, data, i, ti);
9585 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009586 }
9587 else
9588 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009590 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591 for(; i < len; ++i) {
9592 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9593 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009594
Benjamin Peterson29060642009-01-31 22:14:21 +00009595 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009596 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009597 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009598 nu = Py_UNICODE_TOTITLE(ch);
9599
9600 if (nu > maxchar)
9601 maxchar = nu;
9602 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009603
Benjamin Peterson29060642009-01-31 22:14:21 +00009604 if (Py_UNICODE_ISLOWER(ch) ||
9605 Py_UNICODE_ISUPPER(ch) ||
9606 Py_UNICODE_ISTITLE(ch))
9607 previous_is_cased = 1;
9608 else
9609 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009612}
9613
Tim Peters8ce9f162004-08-27 01:49:32 +00009614PyObject *
9615PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009616{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009617 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009618 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009619 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009620 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009621 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9622 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009623 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009625 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009627 int use_memcpy;
9628 unsigned char *res_data = NULL, *sep_data = NULL;
9629 PyObject *last_obj;
9630 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631
Tim Peters05eba1f2004-08-27 21:32:02 +00009632 fseq = PySequence_Fast(seq, "");
9633 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009634 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009635 }
9636
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009637 /* NOTE: the following code can't call back into Python code,
9638 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009639 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009640
Tim Peters05eba1f2004-08-27 21:32:02 +00009641 seqlen = PySequence_Fast_GET_SIZE(fseq);
9642 /* If empty sequence, return u"". */
9643 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009644 Py_DECREF(fseq);
9645 Py_INCREF(unicode_empty);
9646 res = unicode_empty;
9647 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009648 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009649
Tim Peters05eba1f2004-08-27 21:32:02 +00009650 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009651 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009652 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009653 if (seqlen == 1) {
9654 if (PyUnicode_CheckExact(items[0])) {
9655 res = items[0];
9656 Py_INCREF(res);
9657 Py_DECREF(fseq);
9658 return res;
9659 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009660 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009661 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009662 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009663 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009664 /* Set up sep and seplen */
9665 if (separator == NULL) {
9666 /* fall back to a blank space separator */
9667 sep = PyUnicode_FromOrdinal(' ');
9668 if (!sep)
9669 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009670 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009671 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009672 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009673 else {
9674 if (!PyUnicode_Check(separator)) {
9675 PyErr_Format(PyExc_TypeError,
9676 "separator: expected str instance,"
9677 " %.80s found",
9678 Py_TYPE(separator)->tp_name);
9679 goto onError;
9680 }
9681 if (PyUnicode_READY(separator))
9682 goto onError;
9683 sep = separator;
9684 seplen = PyUnicode_GET_LENGTH(separator);
9685 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9686 /* inc refcount to keep this code path symmetric with the
9687 above case of a blank separator */
9688 Py_INCREF(sep);
9689 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009690 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009691 }
9692
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009693 /* There are at least two things to join, or else we have a subclass
9694 * of str in the sequence.
9695 * Do a pre-pass to figure out the total amount of space we'll
9696 * need (sz), and see whether all argument are strings.
9697 */
9698 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009699#ifdef Py_DEBUG
9700 use_memcpy = 0;
9701#else
9702 use_memcpy = 1;
9703#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009704 for (i = 0; i < seqlen; i++) {
9705 const Py_ssize_t old_sz = sz;
9706 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009707 if (!PyUnicode_Check(item)) {
9708 PyErr_Format(PyExc_TypeError,
9709 "sequence item %zd: expected str instance,"
9710 " %.80s found",
9711 i, Py_TYPE(item)->tp_name);
9712 goto onError;
9713 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009714 if (PyUnicode_READY(item) == -1)
9715 goto onError;
9716 sz += PyUnicode_GET_LENGTH(item);
9717 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009718 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009719 if (i != 0)
9720 sz += seplen;
9721 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9722 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009723 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009724 goto onError;
9725 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009726 if (use_memcpy && last_obj != NULL) {
9727 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9728 use_memcpy = 0;
9729 }
9730 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009731 }
Tim Petersced69f82003-09-16 20:30:58 +00009732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009734 if (res == NULL)
9735 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009736
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009737 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009738#ifdef Py_DEBUG
9739 use_memcpy = 0;
9740#else
9741 if (use_memcpy) {
9742 res_data = PyUnicode_1BYTE_DATA(res);
9743 kind = PyUnicode_KIND(res);
9744 if (seplen != 0)
9745 sep_data = PyUnicode_1BYTE_DATA(sep);
9746 }
9747#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009748 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009749 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009750 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009751 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009752 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009753 if (use_memcpy) {
9754 Py_MEMCPY(res_data,
9755 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009756 kind * seplen);
9757 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009758 }
9759 else {
9760 copy_characters(res, res_offset, sep, 0, seplen);
9761 res_offset += seplen;
9762 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009763 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009764 itemlen = PyUnicode_GET_LENGTH(item);
9765 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009766 if (use_memcpy) {
9767 Py_MEMCPY(res_data,
9768 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009769 kind * itemlen);
9770 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009771 }
9772 else {
9773 copy_characters(res, res_offset, item, 0, itemlen);
9774 res_offset += itemlen;
9775 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009776 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009777 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009778 if (use_memcpy)
9779 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009780 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009781 else
9782 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009783
Tim Peters05eba1f2004-08-27 21:32:02 +00009784 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009786 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009788
Benjamin Peterson29060642009-01-31 22:14:21 +00009789 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009790 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009792 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009793 return NULL;
9794}
9795
/* Store `length` copies of the code point `value` into the character
   buffer `data` of storage kind `kind`, beginning at character index
   `start`.  `kind` must not be PyUnicode_WCHAR_KIND.  Every argument is
   parenthesized in the expansion so that arbitrary expressions can be
   passed; note that `value` and `length` may be evaluated repeatedly. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
9818
Victor Stinner9310abb2011-10-05 00:59:23 +02009819static PyObject *
9820pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009821 Py_ssize_t left,
9822 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009823 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009824{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825 PyObject *u;
9826 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009827 int kind;
9828 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009829
9830 if (left < 0)
9831 left = 0;
9832 if (right < 0)
9833 right = 0;
9834
Tim Peters7a29bd52001-09-12 03:03:31 +00009835 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009836 Py_INCREF(self);
9837 return self;
9838 }
9839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009840 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9841 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009842 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9843 return NULL;
9844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9846 if (fill > maxchar)
9847 maxchar = fill;
9848 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009849 if (!u)
9850 return NULL;
9851
9852 kind = PyUnicode_KIND(u);
9853 data = PyUnicode_DATA(u);
9854 if (left)
9855 FILL(kind, data, fill, 0, left);
9856 if (right)
9857 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009858 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009859 assert(_PyUnicode_CheckConsistency(u, 1));
9860 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009863
Alexander Belopolsky40018472011-02-26 01:02:56 +00009864PyObject *
9865PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009867 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009868
9869 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009871 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 switch(PyUnicode_KIND(string)) {
9874 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009875 if (PyUnicode_IS_ASCII(string))
9876 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009877 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009878 PyUnicode_GET_LENGTH(string), keepends);
9879 else
9880 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009881 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009882 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009883 break;
9884 case PyUnicode_2BYTE_KIND:
9885 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009886 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009887 PyUnicode_GET_LENGTH(string), keepends);
9888 break;
9889 case PyUnicode_4BYTE_KIND:
9890 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009891 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009892 PyUnicode_GET_LENGTH(string), keepends);
9893 break;
9894 default:
9895 assert(0);
9896 list = 0;
9897 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009898 Py_DECREF(string);
9899 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009900}
9901
Alexander Belopolsky40018472011-02-26 01:02:56 +00009902static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009903split(PyObject *self,
9904 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009905 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 int kind1, kind2, kind;
9908 void *buf1, *buf2;
9909 Py_ssize_t len1, len2;
9910 PyObject* out;
9911
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009913 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009915 if (PyUnicode_READY(self) == -1)
9916 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009917
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 if (substring == NULL)
9919 switch(PyUnicode_KIND(self)) {
9920 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009921 if (PyUnicode_IS_ASCII(self))
9922 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009923 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009924 PyUnicode_GET_LENGTH(self), maxcount
9925 );
9926 else
9927 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009928 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009929 PyUnicode_GET_LENGTH(self), maxcount
9930 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931 case PyUnicode_2BYTE_KIND:
9932 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009933 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 PyUnicode_GET_LENGTH(self), maxcount
9935 );
9936 case PyUnicode_4BYTE_KIND:
9937 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009938 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 PyUnicode_GET_LENGTH(self), maxcount
9940 );
9941 default:
9942 assert(0);
9943 return NULL;
9944 }
9945
9946 if (PyUnicode_READY(substring) == -1)
9947 return NULL;
9948
9949 kind1 = PyUnicode_KIND(self);
9950 kind2 = PyUnicode_KIND(substring);
9951 kind = kind1 > kind2 ? kind1 : kind2;
9952 buf1 = PyUnicode_DATA(self);
9953 buf2 = PyUnicode_DATA(substring);
9954 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009955 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 if (!buf1)
9957 return NULL;
9958 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009959 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009960 if (!buf2) {
9961 if (kind1 != kind) PyMem_Free(buf1);
9962 return NULL;
9963 }
9964 len1 = PyUnicode_GET_LENGTH(self);
9965 len2 = PyUnicode_GET_LENGTH(substring);
9966
9967 switch(kind) {
9968 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009969 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9970 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009971 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009972 else
9973 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009974 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009975 break;
9976 case PyUnicode_2BYTE_KIND:
9977 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009978 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 break;
9980 case PyUnicode_4BYTE_KIND:
9981 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009982 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 break;
9984 default:
9985 out = NULL;
9986 }
9987 if (kind1 != kind)
9988 PyMem_Free(buf1);
9989 if (kind2 != kind)
9990 PyMem_Free(buf2);
9991 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009992}
9993
Alexander Belopolsky40018472011-02-26 01:02:56 +00009994static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009995rsplit(PyObject *self,
9996 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009997 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009998{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 int kind1, kind2, kind;
10000 void *buf1, *buf2;
10001 Py_ssize_t len1, len2;
10002 PyObject* out;
10003
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010004 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010005 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010007 if (PyUnicode_READY(self) == -1)
10008 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010009
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 if (substring == NULL)
10011 switch(PyUnicode_KIND(self)) {
10012 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010013 if (PyUnicode_IS_ASCII(self))
10014 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010015 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010016 PyUnicode_GET_LENGTH(self), maxcount
10017 );
10018 else
10019 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010020 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010021 PyUnicode_GET_LENGTH(self), maxcount
10022 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 case PyUnicode_2BYTE_KIND:
10024 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010025 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 PyUnicode_GET_LENGTH(self), maxcount
10027 );
10028 case PyUnicode_4BYTE_KIND:
10029 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010030 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 PyUnicode_GET_LENGTH(self), maxcount
10032 );
10033 default:
10034 assert(0);
10035 return NULL;
10036 }
10037
10038 if (PyUnicode_READY(substring) == -1)
10039 return NULL;
10040
10041 kind1 = PyUnicode_KIND(self);
10042 kind2 = PyUnicode_KIND(substring);
10043 kind = kind1 > kind2 ? kind1 : kind2;
10044 buf1 = PyUnicode_DATA(self);
10045 buf2 = PyUnicode_DATA(substring);
10046 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010047 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048 if (!buf1)
10049 return NULL;
10050 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010051 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 if (!buf2) {
10053 if (kind1 != kind) PyMem_Free(buf1);
10054 return NULL;
10055 }
10056 len1 = PyUnicode_GET_LENGTH(self);
10057 len2 = PyUnicode_GET_LENGTH(substring);
10058
10059 switch(kind) {
10060 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010061 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10062 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010063 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010064 else
10065 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010066 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 break;
10068 case PyUnicode_2BYTE_KIND:
10069 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010070 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071 break;
10072 case PyUnicode_4BYTE_KIND:
10073 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010074 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 break;
10076 default:
10077 out = NULL;
10078 }
10079 if (kind1 != kind)
10080 PyMem_Free(buf1);
10081 if (kind2 != kind)
10082 PyMem_Free(buf2);
10083 return out;
10084}
10085
10086static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010087anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10088 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089{
10090 switch(kind) {
10091 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010092 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10093 return asciilib_find(buf1, len1, buf2, len2, offset);
10094 else
10095 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 case PyUnicode_2BYTE_KIND:
10097 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10098 case PyUnicode_4BYTE_KIND:
10099 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10100 }
10101 assert(0);
10102 return -1;
10103}
10104
10105static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010106anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10107 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108{
10109 switch(kind) {
10110 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010111 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10112 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10113 else
10114 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 case PyUnicode_2BYTE_KIND:
10116 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10117 case PyUnicode_4BYTE_KIND:
10118 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10119 }
10120 assert(0);
10121 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010122}
10123
Alexander Belopolsky40018472011-02-26 01:02:56 +000010124static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125replace(PyObject *self, PyObject *str1,
10126 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010127{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 PyObject *u;
10129 char *sbuf = PyUnicode_DATA(self);
10130 char *buf1 = PyUnicode_DATA(str1);
10131 char *buf2 = PyUnicode_DATA(str2);
10132 int srelease = 0, release1 = 0, release2 = 0;
10133 int skind = PyUnicode_KIND(self);
10134 int kind1 = PyUnicode_KIND(str1);
10135 int kind2 = PyUnicode_KIND(str2);
10136 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10137 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10138 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010139 int mayshrink;
10140 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141
10142 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010143 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010145 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146
Victor Stinner59de0ee2011-10-07 10:01:28 +020010147 if (str1 == str2)
10148 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 if (skind < kind1)
10150 /* substring too wide to be present */
10151 goto nothing;
10152
Victor Stinner49a0a212011-10-12 23:46:10 +020010153 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10154 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10155 /* Replacing str1 with str2 may cause a maxchar reduction in the
10156 result string. */
10157 mayshrink = (maxchar_str2 < maxchar);
10158 maxchar = Py_MAX(maxchar, maxchar_str2);
10159
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010161 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010162 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010164 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010166 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010167 Py_UCS4 u1, u2;
10168 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010170 if (findchar(sbuf, PyUnicode_KIND(self),
10171 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010172 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010175 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010177 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 rkind = PyUnicode_KIND(u);
10179 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10180 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010181 if (--maxcount < 0)
10182 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010184 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010185 }
10186 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 int rkind = skind;
10188 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 if (kind1 < rkind) {
10191 /* widen substring */
10192 buf1 = _PyUnicode_AsKind(str1, rkind);
10193 if (!buf1) goto error;
10194 release1 = 1;
10195 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010196 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010197 if (i < 0)
10198 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010199 if (rkind > kind2) {
10200 /* widen replacement */
10201 buf2 = _PyUnicode_AsKind(str2, rkind);
10202 if (!buf2) goto error;
10203 release2 = 1;
10204 }
10205 else if (rkind < kind2) {
10206 /* widen self and buf1 */
10207 rkind = kind2;
10208 if (release1) PyMem_Free(buf1);
10209 sbuf = _PyUnicode_AsKind(self, rkind);
10210 if (!sbuf) goto error;
10211 srelease = 1;
10212 buf1 = _PyUnicode_AsKind(str1, rkind);
10213 if (!buf1) goto error;
10214 release1 = 1;
10215 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010216 u = PyUnicode_New(slen, maxchar);
10217 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010219 assert(PyUnicode_KIND(u) == rkind);
10220 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010221
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010222 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010223 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010224 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010226 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010228
10229 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010230 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010231 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010232 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010233 if (i == -1)
10234 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010235 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010237 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010239 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010241 }
10242 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 Py_ssize_t n, i, j, ires;
10244 Py_ssize_t product, new_size;
10245 int rkind = skind;
10246 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010249 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 buf1 = _PyUnicode_AsKind(str1, rkind);
10251 if (!buf1) goto error;
10252 release1 = 1;
10253 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010254 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010255 if (n == 0)
10256 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010258 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 buf2 = _PyUnicode_AsKind(str2, rkind);
10260 if (!buf2) goto error;
10261 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010262 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010264 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 rkind = kind2;
10266 sbuf = _PyUnicode_AsKind(self, rkind);
10267 if (!sbuf) goto error;
10268 srelease = 1;
10269 if (release1) PyMem_Free(buf1);
10270 buf1 = _PyUnicode_AsKind(str1, rkind);
10271 if (!buf1) goto error;
10272 release1 = 1;
10273 }
10274 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10275 PyUnicode_GET_LENGTH(str1))); */
10276 product = n * (len2-len1);
10277 if ((product / (len2-len1)) != n) {
10278 PyErr_SetString(PyExc_OverflowError,
10279 "replace string is too long");
10280 goto error;
10281 }
10282 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010283 if (new_size == 0) {
10284 Py_INCREF(unicode_empty);
10285 u = unicode_empty;
10286 goto done;
10287 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10289 PyErr_SetString(PyExc_OverflowError,
10290 "replace string is too long");
10291 goto error;
10292 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010293 u = PyUnicode_New(new_size, maxchar);
10294 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010296 assert(PyUnicode_KIND(u) == rkind);
10297 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 ires = i = 0;
10299 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010300 while (n-- > 0) {
10301 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010302 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010303 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010304 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010305 if (j == -1)
10306 break;
10307 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010308 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010309 memcpy(res + rkind * ires,
10310 sbuf + rkind * i,
10311 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010313 }
10314 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010316 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010318 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010320 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010322 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010324 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010325 memcpy(res + rkind * ires,
10326 sbuf + rkind * i,
10327 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010328 }
10329 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010330 /* interleave */
10331 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010332 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010334 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010336 if (--n <= 0)
10337 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010338 memcpy(res + rkind * ires,
10339 sbuf + rkind * i,
10340 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 ires++;
10342 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010343 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010344 memcpy(res + rkind * ires,
10345 sbuf + rkind * i,
10346 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010347 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010348 }
10349
10350 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010351 unicode_adjust_maxchar(&u);
10352 if (u == NULL)
10353 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010354 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010355
10356 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 if (srelease)
10358 PyMem_FREE(sbuf);
10359 if (release1)
10360 PyMem_FREE(buf1);
10361 if (release2)
10362 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010363 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010365
Benjamin Peterson29060642009-01-31 22:14:21 +000010366 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010367 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 if (srelease)
10369 PyMem_FREE(sbuf);
10370 if (release1)
10371 PyMem_FREE(buf1);
10372 if (release2)
10373 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010374 if (PyUnicode_CheckExact(self)) {
10375 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010376 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010377 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010378 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 error:
10380 if (srelease && sbuf)
10381 PyMem_FREE(sbuf);
10382 if (release1 && buf1)
10383 PyMem_FREE(buf1);
10384 if (release2 && buf2)
10385 PyMem_FREE(buf2);
10386 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387}
10388
10389/* --- Unicode Object Methods --------------------------------------------- */
10390
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010391PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010392 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393\n\
10394Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010395characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396
10397static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010398unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010400 return fixup(self, fixtitle);
10401}
10402
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010403PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010404 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405\n\
10406Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010407have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010408
10409static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010410unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010411{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010412 return fixup(self, fixcapitalize);
10413}
10414
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split self on whitespace,
   replace every list item by its fixcapitalize'd version, then join the
   list.  Returns NULL if any of those steps fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *capitalized = fixup(PyList_GET_ITEM(words, idx),
                                      fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
        idx++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
10452
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010453/* Argument converter. Coerces to a single unicode character */
10454
10455static int
10456convert_uc(PyObject *obj, void *addr)
10457{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010459 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010460
Benjamin Peterson14339b62009-01-31 16:36:08 +000010461 uniobj = PyUnicode_FromObject(obj);
10462 if (uniobj == NULL) {
10463 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010464 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010465 return 0;
10466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010468 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010469 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010470 Py_DECREF(uniobj);
10471 return 0;
10472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010474 Py_DECREF(uniobj);
10475 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010476}
10477
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010478PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010479 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010481Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010482done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483
10484static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010485unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010487 Py_ssize_t marg, left;
10488 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 Py_UCS4 fillchar = ' ';
10490
Victor Stinnere9a29352011-10-01 02:14:59 +020010491 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010493
Victor Stinnere9a29352011-10-01 02:14:59 +020010494 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495 return NULL;
10496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010498 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010499 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500 }
10501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503 left = marg / 2 + (marg & width & 1);
10504
Victor Stinner9310abb2011-10-05 00:59:23 +020010505 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010506}
10507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508/* This function assumes that str1 and str2 are readied by the caller. */
10509
Marc-André Lemburge5034372000-08-08 08:04:29 +000010510static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010511unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010512{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 int kind1, kind2;
10514 void *data1, *data2;
10515 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 kind1 = PyUnicode_KIND(str1);
10518 kind2 = PyUnicode_KIND(str2);
10519 data1 = PyUnicode_DATA(str1);
10520 data2 = PyUnicode_DATA(str2);
10521 len1 = PyUnicode_GET_LENGTH(str1);
10522 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 for (i = 0; i < len1 && i < len2; ++i) {
10525 Py_UCS4 c1, c2;
10526 c1 = PyUnicode_READ(kind1, data1, i);
10527 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010528
10529 if (c1 != c2)
10530 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010531 }
10532
10533 return (len1 < len2) ? -1 : (len1 != len2);
10534}
10535
Alexander Belopolsky40018472011-02-26 01:02:56 +000010536int
10537PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010538{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10540 if (PyUnicode_READY(left) == -1 ||
10541 PyUnicode_READY(right) == -1)
10542 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010543 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010545 PyErr_Format(PyExc_TypeError,
10546 "Can't compare %.100s and %.100s",
10547 left->ob_type->tp_name,
10548 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549 return -1;
10550}
10551
Martin v. Löwis5b222132007-06-10 09:51:05 +000010552int
10553PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10554{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 Py_ssize_t i;
10556 int kind;
10557 void *data;
10558 Py_UCS4 chr;
10559
Victor Stinner910337b2011-10-03 03:20:16 +020010560 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 if (PyUnicode_READY(uni) == -1)
10562 return -1;
10563 kind = PyUnicode_KIND(uni);
10564 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010565 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10567 if (chr != str[i])
10568 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010569 /* This check keeps Python strings that end in '\0' from comparing equal
10570 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010572 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010573 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010574 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010575 return 0;
10576}
10577
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010578
Benjamin Peterson29060642009-01-31 22:14:21 +000010579#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010580 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010581
Alexander Belopolsky40018472011-02-26 01:02:56 +000010582PyObject *
10583PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010584{
10585 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010586
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010587 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10588 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 if (PyUnicode_READY(left) == -1 ||
10590 PyUnicode_READY(right) == -1)
10591 return NULL;
10592 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10593 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010594 if (op == Py_EQ) {
10595 Py_INCREF(Py_False);
10596 return Py_False;
10597 }
10598 if (op == Py_NE) {
10599 Py_INCREF(Py_True);
10600 return Py_True;
10601 }
10602 }
10603 if (left == right)
10604 result = 0;
10605 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010606 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010607
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010608 /* Convert the return value to a Boolean */
10609 switch (op) {
10610 case Py_EQ:
10611 v = TEST_COND(result == 0);
10612 break;
10613 case Py_NE:
10614 v = TEST_COND(result != 0);
10615 break;
10616 case Py_LE:
10617 v = TEST_COND(result <= 0);
10618 break;
10619 case Py_GE:
10620 v = TEST_COND(result >= 0);
10621 break;
10622 case Py_LT:
10623 v = TEST_COND(result == -1);
10624 break;
10625 case Py_GT:
10626 v = TEST_COND(result == 1);
10627 break;
10628 default:
10629 PyErr_BadArgument();
10630 return NULL;
10631 }
10632 Py_INCREF(v);
10633 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010634 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010635
Brian Curtindfc80e32011-08-10 20:28:54 -050010636 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010637}
10638
Alexander Belopolsky40018472011-02-26 01:02:56 +000010639int
10640PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010641{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010642 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 int kind1, kind2, kind;
10644 void *buf1, *buf2;
10645 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010646 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010647
10648 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010649 sub = PyUnicode_FromObject(element);
10650 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010651 PyErr_Format(PyExc_TypeError,
10652 "'in <string>' requires string as left operand, not %s",
10653 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010654 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010655 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 if (PyUnicode_READY(sub) == -1)
10657 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010658
Thomas Wouters477c8d52006-05-27 19:21:47 +000010659 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010660 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010661 Py_DECREF(sub);
10662 return -1;
10663 }
10664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 kind1 = PyUnicode_KIND(str);
10666 kind2 = PyUnicode_KIND(sub);
10667 kind = kind1 > kind2 ? kind1 : kind2;
10668 buf1 = PyUnicode_DATA(str);
10669 buf2 = PyUnicode_DATA(sub);
10670 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010671 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 if (!buf1) {
10673 Py_DECREF(sub);
10674 return -1;
10675 }
10676 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010677 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 if (!buf2) {
10679 Py_DECREF(sub);
10680 if (kind1 != kind) PyMem_Free(buf1);
10681 return -1;
10682 }
10683 len1 = PyUnicode_GET_LENGTH(str);
10684 len2 = PyUnicode_GET_LENGTH(sub);
10685
10686 switch(kind) {
10687 case PyUnicode_1BYTE_KIND:
10688 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10689 break;
10690 case PyUnicode_2BYTE_KIND:
10691 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10692 break;
10693 case PyUnicode_4BYTE_KIND:
10694 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10695 break;
10696 default:
10697 result = -1;
10698 assert(0);
10699 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010700
10701 Py_DECREF(str);
10702 Py_DECREF(sub);
10703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 if (kind1 != kind)
10705 PyMem_Free(buf1);
10706 if (kind2 != kind)
10707 PyMem_Free(buf2);
10708
Guido van Rossum403d68b2000-03-13 15:55:09 +000010709 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010710}
10711
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712/* Concat to string or Unicode object giving a new Unicode object. */
10713
Alexander Belopolsky40018472011-02-26 01:02:56 +000010714PyObject *
10715PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010718 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719
10720 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010723 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010724 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010725 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010726 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727
10728 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010729 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010730 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010733 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010734 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010736 }
10737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010739 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10740 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 w = PyUnicode_New(
10744 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10745 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010746 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010747 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010748 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10749 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010750 Py_DECREF(u);
10751 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010752 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754
Benjamin Peterson29060642009-01-31 22:14:21 +000010755 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756 Py_XDECREF(u);
10757 Py_XDECREF(v);
10758 return NULL;
10759}
10760
Victor Stinnerb0923652011-10-04 01:17:31 +020010761static void
10762unicode_append_inplace(PyObject **p_left, PyObject *right)
10763{
10764 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010765
10766 assert(PyUnicode_IS_READY(*p_left));
10767 assert(PyUnicode_IS_READY(right));
10768
10769 left_len = PyUnicode_GET_LENGTH(*p_left);
10770 right_len = PyUnicode_GET_LENGTH(right);
10771 if (left_len > PY_SSIZE_T_MAX - right_len) {
10772 PyErr_SetString(PyExc_OverflowError,
10773 "strings are too large to concat");
10774 goto error;
10775 }
10776 new_len = left_len + right_len;
10777
10778 /* Now we own the last reference to 'left', so we can resize it
10779 * in-place.
10780 */
10781 if (unicode_resize(p_left, new_len) != 0) {
10782 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10783 * deallocated so it cannot be put back into
10784 * 'variable'. The MemoryError is raised when there
10785 * is no value in 'variable', which might (very
10786 * remotely) be a cause of incompatibilities.
10787 */
10788 goto error;
10789 }
10790 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010791 copy_characters(*p_left, left_len, right, 0, right_len);
10792 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010793 return;
10794
10795error:
10796 Py_DECREF(*p_left);
10797 *p_left = NULL;
10798}
10799
Walter Dörwald1ab83302007-05-18 17:15:44 +000010800void
Victor Stinner23e56682011-10-03 03:54:37 +020010801PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010802{
Victor Stinner23e56682011-10-03 03:54:37 +020010803 PyObject *left, *res;
10804
10805 if (p_left == NULL) {
10806 if (!PyErr_Occurred())
10807 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010808 return;
10809 }
Victor Stinner23e56682011-10-03 03:54:37 +020010810 left = *p_left;
10811 if (right == NULL || !PyUnicode_Check(left)) {
10812 if (!PyErr_Occurred())
10813 PyErr_BadInternalCall();
10814 goto error;
10815 }
10816
Victor Stinnere1335c72011-10-04 20:53:03 +020010817 if (PyUnicode_READY(left))
10818 goto error;
10819 if (PyUnicode_READY(right))
10820 goto error;
10821
Victor Stinner23e56682011-10-03 03:54:37 +020010822 if (PyUnicode_CheckExact(left) && left != unicode_empty
10823 && PyUnicode_CheckExact(right) && right != unicode_empty
10824 && unicode_resizable(left)
10825 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10826 || _PyUnicode_WSTR(left) != NULL))
10827 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010828 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10829 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010830 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010831 not so different than duplicating the string. */
10832 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010833 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010834 unicode_append_inplace(p_left, right);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010835 assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010836 return;
10837 }
10838 }
10839
10840 res = PyUnicode_Concat(left, right);
10841 if (res == NULL)
10842 goto error;
10843 Py_DECREF(left);
10844 *p_left = res;
10845 return;
10846
10847error:
10848 Py_DECREF(*p_left);
10849 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010850}
10851
10852void
10853PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10854{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010855 PyUnicode_Append(pleft, right);
10856 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010857}
10858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010859PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010860 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010862Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010863string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010864interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865
10866static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010867unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010869 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010870 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010871 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 int kind1, kind2, kind;
10874 void *buf1, *buf2;
10875 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876
Jesus Ceaac451502011-04-20 17:09:23 +020010877 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10878 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010879 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 kind1 = PyUnicode_KIND(self);
10882 kind2 = PyUnicode_KIND(substring);
10883 kind = kind1 > kind2 ? kind1 : kind2;
10884 buf1 = PyUnicode_DATA(self);
10885 buf2 = PyUnicode_DATA(substring);
10886 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010887 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010888 if (!buf1) {
10889 Py_DECREF(substring);
10890 return NULL;
10891 }
10892 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010893 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 if (!buf2) {
10895 Py_DECREF(substring);
10896 if (kind1 != kind) PyMem_Free(buf1);
10897 return NULL;
10898 }
10899 len1 = PyUnicode_GET_LENGTH(self);
10900 len2 = PyUnicode_GET_LENGTH(substring);
10901
10902 ADJUST_INDICES(start, end, len1);
10903 switch(kind) {
10904 case PyUnicode_1BYTE_KIND:
10905 iresult = ucs1lib_count(
10906 ((Py_UCS1*)buf1) + start, end - start,
10907 buf2, len2, PY_SSIZE_T_MAX
10908 );
10909 break;
10910 case PyUnicode_2BYTE_KIND:
10911 iresult = ucs2lib_count(
10912 ((Py_UCS2*)buf1) + start, end - start,
10913 buf2, len2, PY_SSIZE_T_MAX
10914 );
10915 break;
10916 case PyUnicode_4BYTE_KIND:
10917 iresult = ucs4lib_count(
10918 ((Py_UCS4*)buf1) + start, end - start,
10919 buf2, len2, PY_SSIZE_T_MAX
10920 );
10921 break;
10922 default:
10923 assert(0); iresult = 0;
10924 }
10925
10926 result = PyLong_FromSsize_t(iresult);
10927
10928 if (kind1 != kind)
10929 PyMem_Free(buf1);
10930 if (kind2 != kind)
10931 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932
10933 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010934
Guido van Rossumd57fd912000-03-10 22:53:23 +000010935 return result;
10936}
10937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010938PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010939 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010941Encode S using the codec registered for encoding. Default encoding\n\
10942is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010943handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010944a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10945'xmlcharrefreplace' as well as any other name registered with\n\
10946codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947
10948static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010949unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010951 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952 char *encoding = NULL;
10953 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010954
Benjamin Peterson308d6372009-09-18 21:42:35 +000010955 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10956 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010958 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010959}
10960
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010961PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010962 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963\n\
10964Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010965If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966
10967static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010968unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010970 Py_ssize_t i, j, line_pos, src_len, incr;
10971 Py_UCS4 ch;
10972 PyObject *u;
10973 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010975 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010976 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977
10978 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010979 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980
Antoine Pitrou22425222011-10-04 19:10:51 +020010981 if (PyUnicode_READY(self) == -1)
10982 return NULL;
10983
Thomas Wouters7e474022000-07-16 12:04:32 +000010984 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010985 src_len = PyUnicode_GET_LENGTH(self);
10986 i = j = line_pos = 0;
10987 kind = PyUnicode_KIND(self);
10988 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010989 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010990 for (; i < src_len; i++) {
10991 ch = PyUnicode_READ(kind, src_data, i);
10992 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010993 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010994 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010995 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010996 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010997 goto overflow;
10998 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010999 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011000 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011003 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011004 goto overflow;
11005 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011007 if (ch == '\n' || ch == '\r')
11008 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011010 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020011011 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010011012 Py_INCREF(self);
11013 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011014 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011015
Guido van Rossumd57fd912000-03-10 22:53:23 +000011016 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011017 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018 if (!u)
11019 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011020 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021
Antoine Pitroue71d5742011-10-04 15:55:09 +020011022 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023
Antoine Pitroue71d5742011-10-04 15:55:09 +020011024 for (; i < src_len; i++) {
11025 ch = PyUnicode_READ(kind, src_data, i);
11026 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011027 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011028 incr = tabsize - (line_pos % tabsize);
11029 line_pos += incr;
11030 while (incr--) {
11031 PyUnicode_WRITE(kind, dest_data, j, ' ');
11032 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011033 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011034 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011035 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011036 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011037 line_pos++;
11038 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011039 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011040 if (ch == '\n' || ch == '\r')
11041 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011043 }
11044 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011045 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011046
Antoine Pitroue71d5742011-10-04 15:55:09 +020011047 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011048 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11049 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050}
11051
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011052PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011053 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054\n\
11055Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011056such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057arguments start and end are interpreted as in slice notation.\n\
11058\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011059Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060
11061static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011062unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011064 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011065 Py_ssize_t start;
11066 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011067 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068
Jesus Ceaac451502011-04-20 17:09:23 +020011069 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11070 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011073 if (PyUnicode_READY(self) == -1)
11074 return NULL;
11075 if (PyUnicode_READY(substring) == -1)
11076 return NULL;
11077
Victor Stinner7931d9a2011-11-04 00:22:48 +010011078 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079
11080 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 if (result == -2)
11083 return NULL;
11084
Christian Heimes217cfd12007-12-02 14:31:20 +000011085 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011086}
11087
11088static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011089unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011091 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11092 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011094 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095}
11096
Guido van Rossumc2504932007-09-18 19:42:40 +000011097/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011098 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011099static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011100unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011101{
Guido van Rossumc2504932007-09-18 19:42:40 +000011102 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011103 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011105 if (_PyUnicode_HASH(self) != -1)
11106 return _PyUnicode_HASH(self);
11107 if (PyUnicode_READY(self) == -1)
11108 return -1;
11109 len = PyUnicode_GET_LENGTH(self);
11110
11111 /* The hash function as a macro, gets expanded three times below. */
11112#define HASH(P) \
11113 x = (Py_uhash_t)*P << 7; \
11114 while (--len >= 0) \
11115 x = (1000003*x) ^ (Py_uhash_t)*P++;
11116
11117 switch (PyUnicode_KIND(self)) {
11118 case PyUnicode_1BYTE_KIND: {
11119 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11120 HASH(c);
11121 break;
11122 }
11123 case PyUnicode_2BYTE_KIND: {
11124 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11125 HASH(s);
11126 break;
11127 }
11128 default: {
11129 Py_UCS4 *l;
11130 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11131 "Impossible switch case in unicode_hash");
11132 l = PyUnicode_4BYTE_DATA(self);
11133 HASH(l);
11134 break;
11135 }
11136 }
11137 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11138
Guido van Rossumc2504932007-09-18 19:42:40 +000011139 if (x == -1)
11140 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011142 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011146PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011147 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011149Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150
11151static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011152unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011154 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011155 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011156 Py_ssize_t start;
11157 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011158
Jesus Ceaac451502011-04-20 17:09:23 +020011159 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11160 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011161 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 if (PyUnicode_READY(self) == -1)
11164 return NULL;
11165 if (PyUnicode_READY(substring) == -1)
11166 return NULL;
11167
Victor Stinner7931d9a2011-11-04 00:22:48 +010011168 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169
11170 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011172 if (result == -2)
11173 return NULL;
11174
Guido van Rossumd57fd912000-03-10 22:53:23 +000011175 if (result < 0) {
11176 PyErr_SetString(PyExc_ValueError, "substring not found");
11177 return NULL;
11178 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011179
Christian Heimes217cfd12007-12-02 14:31:20 +000011180 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181}
11182
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011183PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011184 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011185\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011186Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011187at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188
11189static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011190unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 Py_ssize_t i, length;
11193 int kind;
11194 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195 int cased;
11196
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011197 if (PyUnicode_READY(self) == -1)
11198 return NULL;
11199 length = PyUnicode_GET_LENGTH(self);
11200 kind = PyUnicode_KIND(self);
11201 data = PyUnicode_DATA(self);
11202
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 if (length == 1)
11205 return PyBool_FromLong(
11206 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011208 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011209 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011210 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011211
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011213 for (i = 0; i < length; i++) {
11214 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011215
Benjamin Peterson29060642009-01-31 22:14:21 +000011216 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11217 return PyBool_FromLong(0);
11218 else if (!cased && Py_UNICODE_ISLOWER(ch))
11219 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011221 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222}
11223
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011224PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011225 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011226\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011227Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011228at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229
11230static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011231unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011233 Py_ssize_t i, length;
11234 int kind;
11235 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236 int cased;
11237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011238 if (PyUnicode_READY(self) == -1)
11239 return NULL;
11240 length = PyUnicode_GET_LENGTH(self);
11241 kind = PyUnicode_KIND(self);
11242 data = PyUnicode_DATA(self);
11243
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245 if (length == 1)
11246 return PyBool_FromLong(
11247 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011249 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011251 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011252
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011254 for (i = 0; i < length; i++) {
11255 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011256
Benjamin Peterson29060642009-01-31 22:14:21 +000011257 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11258 return PyBool_FromLong(0);
11259 else if (!cased && Py_UNICODE_ISUPPER(ch))
11260 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011262 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263}
11264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011265PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011266 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011268Return True if S is a titlecased string and there is at least one\n\
11269character in S, i.e. upper- and titlecase characters may only\n\
11270follow uncased characters and lowercase characters only cased ones.\n\
11271Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272
11273static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011274unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011276 Py_ssize_t i, length;
11277 int kind;
11278 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279 int cased, previous_is_cased;
11280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011281 if (PyUnicode_READY(self) == -1)
11282 return NULL;
11283 length = PyUnicode_GET_LENGTH(self);
11284 kind = PyUnicode_KIND(self);
11285 data = PyUnicode_DATA(self);
11286
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011288 if (length == 1) {
11289 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11290 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11291 (Py_UNICODE_ISUPPER(ch) != 0));
11292 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011294 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011296 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011297
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298 cased = 0;
11299 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011300 for (i = 0; i < length; i++) {
11301 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011302
Benjamin Peterson29060642009-01-31 22:14:21 +000011303 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11304 if (previous_is_cased)
11305 return PyBool_FromLong(0);
11306 previous_is_cased = 1;
11307 cased = 1;
11308 }
11309 else if (Py_UNICODE_ISLOWER(ch)) {
11310 if (!previous_is_cased)
11311 return PyBool_FromLong(0);
11312 previous_is_cased = 1;
11313 cased = 1;
11314 }
11315 else
11316 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011318 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319}
11320
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011321PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011322 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011324Return True if all characters in S are whitespace\n\
11325and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326
11327static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011328unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011330 Py_ssize_t i, length;
11331 int kind;
11332 void *data;
11333
11334 if (PyUnicode_READY(self) == -1)
11335 return NULL;
11336 length = PyUnicode_GET_LENGTH(self);
11337 kind = PyUnicode_KIND(self);
11338 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011339
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 if (length == 1)
11342 return PyBool_FromLong(
11343 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011345 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011346 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011347 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 for (i = 0; i < length; i++) {
11350 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011351 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011352 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011354 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011355}
11356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011357PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011358 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011359\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011360Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011361and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011362
11363static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011364unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011365{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011366 Py_ssize_t i, length;
11367 int kind;
11368 void *data;
11369
11370 if (PyUnicode_READY(self) == -1)
11371 return NULL;
11372 length = PyUnicode_GET_LENGTH(self);
11373 kind = PyUnicode_KIND(self);
11374 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011375
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011376 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 if (length == 1)
11378 return PyBool_FromLong(
11379 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011380
11381 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011383 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011385 for (i = 0; i < length; i++) {
11386 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011387 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011388 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011389 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011390}
11391
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011392PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011393 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011394\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011395Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011396and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011397
11398static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011399unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011400{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011401 int kind;
11402 void *data;
11403 Py_ssize_t len, i;
11404
11405 if (PyUnicode_READY(self) == -1)
11406 return NULL;
11407
11408 kind = PyUnicode_KIND(self);
11409 data = PyUnicode_DATA(self);
11410 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011411
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011412 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 if (len == 1) {
11414 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11415 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11416 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011417
11418 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011419 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011420 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 for (i = 0; i < len; i++) {
11423 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011424 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011425 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011426 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011427 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011428}
11429
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011430PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011431 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011433Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011434False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435
11436static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011437unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 Py_ssize_t i, length;
11440 int kind;
11441 void *data;
11442
11443 if (PyUnicode_READY(self) == -1)
11444 return NULL;
11445 length = PyUnicode_GET_LENGTH(self);
11446 kind = PyUnicode_KIND(self);
11447 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 if (length == 1)
11451 return PyBool_FromLong(
11452 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011454 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011456 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 for (i = 0; i < length; i++) {
11459 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011460 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011462 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463}
11464
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011465PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011466 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011468Return True if all characters in S are digits\n\
11469and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470
11471static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011472unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474 Py_ssize_t i, length;
11475 int kind;
11476 void *data;
11477
11478 if (PyUnicode_READY(self) == -1)
11479 return NULL;
11480 length = PyUnicode_GET_LENGTH(self);
11481 kind = PyUnicode_KIND(self);
11482 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011485 if (length == 1) {
11486 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11487 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011490 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011491 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011492 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494 for (i = 0; i < length; i++) {
11495 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011496 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011498 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499}
11500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011501PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011502 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011504Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011505False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506
11507static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011508unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011510 Py_ssize_t i, length;
11511 int kind;
11512 void *data;
11513
11514 if (PyUnicode_READY(self) == -1)
11515 return NULL;
11516 length = PyUnicode_GET_LENGTH(self);
11517 kind = PyUnicode_KIND(self);
11518 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 if (length == 1)
11522 return PyBool_FromLong(
11523 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011525 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011527 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 for (i = 0; i < length; i++) {
11530 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011531 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011533 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534}
11535
Martin v. Löwis47383402007-08-15 07:32:56 +000011536int
11537PyUnicode_IsIdentifier(PyObject *self)
11538{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539 int kind;
11540 void *data;
11541 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011542 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011544 if (PyUnicode_READY(self) == -1) {
11545 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011546 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 }
11548
11549 /* Special case for empty strings */
11550 if (PyUnicode_GET_LENGTH(self) == 0)
11551 return 0;
11552 kind = PyUnicode_KIND(self);
11553 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011554
11555 /* PEP 3131 says that the first character must be in
11556 XID_Start and subsequent characters in XID_Continue,
11557 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011558 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011559 letters, digits, underscore). However, given the current
11560 definition of XID_Start and XID_Continue, it is sufficient
11561 to check just for these, except that _ must be allowed
11562 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011563 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011564 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011565 return 0;
11566
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011567 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011568 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011569 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011570 return 1;
11571}
11572
11573PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011574 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011575\n\
11576Return True if S is a valid identifier according\n\
11577to the language definition.");
11578
11579static PyObject*
11580unicode_isidentifier(PyObject *self)
11581{
11582 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11583}
11584
Georg Brandl559e5d72008-06-11 18:37:52 +000011585PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011586 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011587\n\
11588Return True if all characters in S are considered\n\
11589printable in repr() or S is empty, False otherwise.");
11590
11591static PyObject*
11592unicode_isprintable(PyObject *self)
11593{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011594 Py_ssize_t i, length;
11595 int kind;
11596 void *data;
11597
11598 if (PyUnicode_READY(self) == -1)
11599 return NULL;
11600 length = PyUnicode_GET_LENGTH(self);
11601 kind = PyUnicode_KIND(self);
11602 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011603
11604 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 if (length == 1)
11606 return PyBool_FromLong(
11607 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609 for (i = 0; i < length; i++) {
11610 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011611 Py_RETURN_FALSE;
11612 }
11613 }
11614 Py_RETURN_TRUE;
11615}
11616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011617PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011618 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619\n\
11620Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011621iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622
11623static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011624unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011626 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627}
11628
Martin v. Löwis18e16552006-02-15 17:27:45 +000011629static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011630unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 if (PyUnicode_READY(self) == -1)
11633 return -1;
11634 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635}
11636
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011637PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011638 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011640Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011641done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642
11643static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011644unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011646 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011647 Py_UCS4 fillchar = ' ';
11648
11649 if (PyUnicode_READY(self) == -1)
11650 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011651
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011652 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653 return NULL;
11654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011657 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658 }
11659
Victor Stinner7931d9a2011-11-04 00:22:48 +010011660 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661}
11662
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011663PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011664 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011666Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667
11668static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011669unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011671 return fixup(self, fixlower);
11672}
11673
/* Selectors for the strip helpers; they double as indices into
   stripformat below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Bare method name: the format string with its 3-character "|O:" prefix
   skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11682
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011683/* externally visible for str.strip(unicode) */
11684PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011685_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011686{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 void *data;
11688 int kind;
11689 Py_ssize_t i, j, len;
11690 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11693 return NULL;
11694
11695 kind = PyUnicode_KIND(self);
11696 data = PyUnicode_DATA(self);
11697 len = PyUnicode_GET_LENGTH(self);
11698 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11699 PyUnicode_DATA(sepobj),
11700 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011701
Benjamin Peterson14339b62009-01-31 16:36:08 +000011702 i = 0;
11703 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 while (i < len &&
11705 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011706 i++;
11707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011708 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011709
Benjamin Peterson14339b62009-01-31 16:36:08 +000011710 j = len;
11711 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011712 do {
11713 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 } while (j >= i &&
11715 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011716 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011717 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011718
Victor Stinner7931d9a2011-11-04 00:22:48 +010011719 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720}
11721
11722PyObject*
11723PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11724{
11725 unsigned char *data;
11726 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011727 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728
Victor Stinnerde636f32011-10-01 03:55:54 +020011729 if (PyUnicode_READY(self) == -1)
11730 return NULL;
11731
11732 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11733
Victor Stinner12bab6d2011-10-01 01:53:49 +020011734 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011735 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011736 if (PyUnicode_CheckExact(self)) {
11737 Py_INCREF(self);
11738 return self;
11739 }
11740 else
11741 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011742 }
11743
Victor Stinner12bab6d2011-10-01 01:53:49 +020011744 length = end - start;
11745 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011746 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747
Victor Stinnerde636f32011-10-01 03:55:54 +020011748 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011749 PyErr_SetString(PyExc_IndexError, "string index out of range");
11750 return NULL;
11751 }
11752
Victor Stinnerb9275c12011-10-05 14:01:42 +020011753 if (PyUnicode_IS_ASCII(self)) {
11754 kind = PyUnicode_KIND(self);
11755 data = PyUnicode_1BYTE_DATA(self);
11756 return unicode_fromascii(data + start, length);
11757 }
11758 else {
11759 kind = PyUnicode_KIND(self);
11760 data = PyUnicode_1BYTE_DATA(self);
11761 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011762 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011763 length);
11764 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766
11767static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011768do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 int kind;
11771 void *data;
11772 Py_ssize_t len, i, j;
11773
11774 if (PyUnicode_READY(self) == -1)
11775 return NULL;
11776
11777 kind = PyUnicode_KIND(self);
11778 data = PyUnicode_DATA(self);
11779 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011780
Benjamin Peterson14339b62009-01-31 16:36:08 +000011781 i = 0;
11782 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011784 i++;
11785 }
11786 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011787
Benjamin Peterson14339b62009-01-31 16:36:08 +000011788 j = len;
11789 if (striptype != LEFTSTRIP) {
11790 do {
11791 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011793 j++;
11794 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011795
Victor Stinner7931d9a2011-11-04 00:22:48 +010011796 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797}
11798
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011799
11800static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011801do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011802{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011803 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011804
Benjamin Peterson14339b62009-01-31 16:36:08 +000011805 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11806 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011807
Benjamin Peterson14339b62009-01-31 16:36:08 +000011808 if (sep != NULL && sep != Py_None) {
11809 if (PyUnicode_Check(sep))
11810 return _PyUnicode_XStrip(self, striptype, sep);
11811 else {
11812 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011813 "%s arg must be None or str",
11814 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011815 return NULL;
11816 }
11817 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011818
Benjamin Peterson14339b62009-01-31 16:36:08 +000011819 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011820}
11821
11822
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011823PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011824 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011825\n\
11826Return a copy of the string S with leading and trailing\n\
11827whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011828If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011829
11830static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011831unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011832{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011833 if (PyTuple_GET_SIZE(args) == 0)
11834 return do_strip(self, BOTHSTRIP); /* Common case */
11835 else
11836 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011837}
11838
11839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011840PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011841 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011842\n\
11843Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011844If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011845
11846static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011847unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011848{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011849 if (PyTuple_GET_SIZE(args) == 0)
11850 return do_strip(self, LEFTSTRIP); /* Common case */
11851 else
11852 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011853}
11854
11855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011856PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011857 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011858\n\
11859Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011860If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011861
11862static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011863unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011864{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011865 if (PyTuple_GET_SIZE(args) == 0)
11866 return do_strip(self, RIGHTSTRIP); /* Common case */
11867 else
11868 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011869}
11870
11871
Guido van Rossumd57fd912000-03-10 22:53:23 +000011872static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011873unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011875 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877
Georg Brandl222de0f2009-04-12 12:01:50 +000011878 if (len < 1) {
11879 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011880 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011881 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882
Tim Peters7a29bd52001-09-12 03:03:31 +000011883 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884 /* no repeat, return original string */
11885 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011886 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887 }
Tim Peters8f422462000-09-09 06:13:41 +000011888
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011889 if (PyUnicode_READY(str) == -1)
11890 return NULL;
11891
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011892 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011893 PyErr_SetString(PyExc_OverflowError,
11894 "repeated string is too long");
11895 return NULL;
11896 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011898
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011899 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011900 if (!u)
11901 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011902 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011904 if (PyUnicode_GET_LENGTH(str) == 1) {
11905 const int kind = PyUnicode_KIND(str);
11906 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11907 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011908 if (kind == PyUnicode_1BYTE_KIND)
11909 memset(to, (unsigned char)fill_char, len);
11910 else {
11911 for (n = 0; n < len; ++n)
11912 PyUnicode_WRITE(kind, to, n, fill_char);
11913 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 }
11915 else {
11916 /* number of characters copied this far */
11917 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011918 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 char *to = (char *) PyUnicode_DATA(u);
11920 Py_MEMCPY(to, PyUnicode_DATA(str),
11921 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011922 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 n = (done <= nchars-done) ? done : nchars-done;
11924 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011925 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011926 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927 }
11928
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011929 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011930 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931}
11932
Alexander Belopolsky40018472011-02-26 01:02:56 +000011933PyObject *
11934PyUnicode_Replace(PyObject *obj,
11935 PyObject *subobj,
11936 PyObject *replobj,
11937 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938{
11939 PyObject *self;
11940 PyObject *str1;
11941 PyObject *str2;
11942 PyObject *result;
11943
11944 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011945 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011946 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011948 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011949 Py_DECREF(self);
11950 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951 }
11952 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011953 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 Py_DECREF(self);
11955 Py_DECREF(str1);
11956 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959 Py_DECREF(self);
11960 Py_DECREF(str1);
11961 Py_DECREF(str2);
11962 return result;
11963}
11964
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011965PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011966 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967\n\
11968Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011969old replaced by new. If the optional argument count is\n\
11970given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971
11972static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011973unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 PyObject *str1;
11976 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011977 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978 PyObject *result;
11979
Martin v. Löwis18e16552006-02-15 17:27:45 +000011980 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011983 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 str1 = PyUnicode_FromObject(str1);
11985 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11986 return NULL;
11987 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011988 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 Py_DECREF(str1);
11990 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011991 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992
11993 result = replace(self, str1, str2, maxcount);
11994
11995 Py_DECREF(str1);
11996 Py_DECREF(str2);
11997 return result;
11998}
11999
Alexander Belopolsky40018472011-02-26 01:02:56 +000012000static PyObject *
12001unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012003 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 Py_ssize_t isize;
12005 Py_ssize_t osize, squote, dquote, i, o;
12006 Py_UCS4 max, quote;
12007 int ikind, okind;
12008 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012009
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012011 return NULL;
12012
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 isize = PyUnicode_GET_LENGTH(unicode);
12014 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 /* Compute length of output, quote characters, and
12017 maximum character */
12018 osize = 2; /* quotes */
12019 max = 127;
12020 squote = dquote = 0;
12021 ikind = PyUnicode_KIND(unicode);
12022 for (i = 0; i < isize; i++) {
12023 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12024 switch (ch) {
12025 case '\'': squote++; osize++; break;
12026 case '"': dquote++; osize++; break;
12027 case '\\': case '\t': case '\r': case '\n':
12028 osize += 2; break;
12029 default:
12030 /* Fast-path ASCII */
12031 if (ch < ' ' || ch == 0x7f)
12032 osize += 4; /* \xHH */
12033 else if (ch < 0x7f)
12034 osize++;
12035 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12036 osize++;
12037 max = ch > max ? ch : max;
12038 }
12039 else if (ch < 0x100)
12040 osize += 4; /* \xHH */
12041 else if (ch < 0x10000)
12042 osize += 6; /* \uHHHH */
12043 else
12044 osize += 10; /* \uHHHHHHHH */
12045 }
12046 }
12047
12048 quote = '\'';
12049 if (squote) {
12050 if (dquote)
12051 /* Both squote and dquote present. Use squote,
12052 and escape them */
12053 osize += squote;
12054 else
12055 quote = '"';
12056 }
12057
12058 repr = PyUnicode_New(osize, max);
12059 if (repr == NULL)
12060 return NULL;
12061 okind = PyUnicode_KIND(repr);
12062 odata = PyUnicode_DATA(repr);
12063
12064 PyUnicode_WRITE(okind, odata, 0, quote);
12065 PyUnicode_WRITE(okind, odata, osize-1, quote);
12066
12067 for (i = 0, o = 1; i < isize; i++) {
12068 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012069
12070 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 if ((ch == quote) || (ch == '\\')) {
12072 PyUnicode_WRITE(okind, odata, o++, '\\');
12073 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012074 continue;
12075 }
12076
Benjamin Peterson29060642009-01-31 22:14:21 +000012077 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012078 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 PyUnicode_WRITE(okind, odata, o++, '\\');
12080 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012081 }
12082 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012083 PyUnicode_WRITE(okind, odata, o++, '\\');
12084 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012085 }
12086 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012087 PyUnicode_WRITE(okind, odata, o++, '\\');
12088 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012089 }
12090
12091 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012092 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 PyUnicode_WRITE(okind, odata, o++, '\\');
12094 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012095 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12096 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012097 }
12098
Georg Brandl559e5d72008-06-11 18:37:52 +000012099 /* Copy ASCII characters as-is */
12100 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012101 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012102 }
12103
Benjamin Peterson29060642009-01-31 22:14:21 +000012104 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012105 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012106 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012107 (categories Z* and C* except ASCII space)
12108 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012110 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 if (ch <= 0xff) {
12112 PyUnicode_WRITE(okind, odata, o++, '\\');
12113 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012114 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12115 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012116 }
12117 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012118 else if (ch >= 0x10000) {
12119 PyUnicode_WRITE(okind, odata, o++, '\\');
12120 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012121 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12122 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12123 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12124 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12125 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12126 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12127 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12128 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012129 }
12130 /* Map 16-bit characters to '\uxxxx' */
12131 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 PyUnicode_WRITE(okind, odata, o++, '\\');
12133 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012134 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12135 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12136 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12137 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012138 }
12139 }
12140 /* Copy characters as-is */
12141 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012142 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012143 }
12144 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012145 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012146 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012147 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012148 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149}
12150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012151PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012152 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153\n\
12154Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012155such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156arguments start and end are interpreted as in slice notation.\n\
12157\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012158Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159
12160static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012163 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012164 Py_ssize_t start;
12165 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012166 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167
Jesus Ceaac451502011-04-20 17:09:23 +020012168 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12169 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172 if (PyUnicode_READY(self) == -1)
12173 return NULL;
12174 if (PyUnicode_READY(substring) == -1)
12175 return NULL;
12176
Victor Stinner7931d9a2011-11-04 00:22:48 +010012177 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178
12179 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 if (result == -2)
12182 return NULL;
12183
Christian Heimes217cfd12007-12-02 14:31:20 +000012184 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012185}
12186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012187PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012188 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012190Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191
12192static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012195 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012196 Py_ssize_t start;
12197 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012198 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012199
Jesus Ceaac451502011-04-20 17:09:23 +020012200 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12201 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012202 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204 if (PyUnicode_READY(self) == -1)
12205 return NULL;
12206 if (PyUnicode_READY(substring) == -1)
12207 return NULL;
12208
Victor Stinner7931d9a2011-11-04 00:22:48 +010012209 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210
12211 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 if (result == -2)
12214 return NULL;
12215
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216 if (result < 0) {
12217 PyErr_SetString(PyExc_ValueError, "substring not found");
12218 return NULL;
12219 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220
Christian Heimes217cfd12007-12-02 14:31:20 +000012221 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222}
12223
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012224PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012225 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012227Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012228done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229
12230static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012231unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012233 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 Py_UCS4 fillchar = ' ';
12235
Victor Stinnere9a29352011-10-01 02:14:59 +020012236 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012238
Victor Stinnere9a29352011-10-01 02:14:59 +020012239 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240 return NULL;
12241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012242 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012244 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012245 }
12246
Victor Stinner7931d9a2011-11-04 00:22:48 +010012247 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248}
12249
Alexander Belopolsky40018472011-02-26 01:02:56 +000012250PyObject *
12251PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252{
12253 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012254
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255 s = PyUnicode_FromObject(s);
12256 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012257 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012258 if (sep != NULL) {
12259 sep = PyUnicode_FromObject(sep);
12260 if (sep == NULL) {
12261 Py_DECREF(s);
12262 return NULL;
12263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264 }
12265
Victor Stinner9310abb2011-10-05 00:59:23 +020012266 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267
12268 Py_DECREF(s);
12269 Py_XDECREF(sep);
12270 return result;
12271}
12272
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012273PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012274 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275\n\
12276Return a list of the words in S, using sep as the\n\
12277delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012278splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012279whitespace string is a separator and empty strings are\n\
12280removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281
12282static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012283unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284{
12285 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012286 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287
Martin v. Löwis18e16552006-02-15 17:27:45 +000012288 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289 return NULL;
12290
12291 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012292 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012294 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012295 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012296 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297}
12298
Thomas Wouters477c8d52006-05-27 19:21:47 +000012299PyObject *
12300PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12301{
12302 PyObject* str_obj;
12303 PyObject* sep_obj;
12304 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305 int kind1, kind2, kind;
12306 void *buf1 = NULL, *buf2 = NULL;
12307 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012308
12309 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012310 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012311 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012312 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012313 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012314 Py_DECREF(str_obj);
12315 return NULL;
12316 }
12317
Victor Stinner14f8f022011-10-05 20:58:25 +020012318 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012320 kind = Py_MAX(kind1, kind2);
12321 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012323 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 if (!buf1)
12325 goto onError;
12326 buf2 = PyUnicode_DATA(sep_obj);
12327 if (kind2 != kind)
12328 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12329 if (!buf2)
12330 goto onError;
12331 len1 = PyUnicode_GET_LENGTH(str_obj);
12332 len2 = PyUnicode_GET_LENGTH(sep_obj);
12333
Victor Stinner14f8f022011-10-05 20:58:25 +020012334 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012336 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12337 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12338 else
12339 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 break;
12341 case PyUnicode_2BYTE_KIND:
12342 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12343 break;
12344 case PyUnicode_4BYTE_KIND:
12345 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12346 break;
12347 default:
12348 assert(0);
12349 out = 0;
12350 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012351
12352 Py_DECREF(sep_obj);
12353 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 if (kind1 != kind)
12355 PyMem_Free(buf1);
12356 if (kind2 != kind)
12357 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012358
12359 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360 onError:
12361 Py_DECREF(sep_obj);
12362 Py_DECREF(str_obj);
12363 if (kind1 != kind && buf1)
12364 PyMem_Free(buf1);
12365 if (kind2 != kind && buf2)
12366 PyMem_Free(buf2);
12367 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012368}
12369
12370
12371PyObject *
12372PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12373{
12374 PyObject* str_obj;
12375 PyObject* sep_obj;
12376 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012377 int kind1, kind2, kind;
12378 void *buf1 = NULL, *buf2 = NULL;
12379 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012380
12381 str_obj = PyUnicode_FromObject(str_in);
12382 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012383 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012384 sep_obj = PyUnicode_FromObject(sep_in);
12385 if (!sep_obj) {
12386 Py_DECREF(str_obj);
12387 return NULL;
12388 }
12389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390 kind1 = PyUnicode_KIND(str_in);
12391 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012392 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393 buf1 = PyUnicode_DATA(str_in);
12394 if (kind1 != kind)
12395 buf1 = _PyUnicode_AsKind(str_in, kind);
12396 if (!buf1)
12397 goto onError;
12398 buf2 = PyUnicode_DATA(sep_obj);
12399 if (kind2 != kind)
12400 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12401 if (!buf2)
12402 goto onError;
12403 len1 = PyUnicode_GET_LENGTH(str_obj);
12404 len2 = PyUnicode_GET_LENGTH(sep_obj);
12405
12406 switch(PyUnicode_KIND(str_in)) {
12407 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012408 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12409 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12410 else
12411 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012412 break;
12413 case PyUnicode_2BYTE_KIND:
12414 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12415 break;
12416 case PyUnicode_4BYTE_KIND:
12417 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12418 break;
12419 default:
12420 assert(0);
12421 out = 0;
12422 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012423
12424 Py_DECREF(sep_obj);
12425 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426 if (kind1 != kind)
12427 PyMem_Free(buf1);
12428 if (kind2 != kind)
12429 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012430
12431 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012432 onError:
12433 Py_DECREF(sep_obj);
12434 Py_DECREF(str_obj);
12435 if (kind1 != kind && buf1)
12436 PyMem_Free(buf1);
12437 if (kind2 != kind && buf2)
12438 PyMem_Free(buf2);
12439 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012440}
12441
12442PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012443 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012444\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012445Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012446the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012447found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012448
12449static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012450unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012451{
Victor Stinner9310abb2011-10-05 00:59:23 +020012452 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012453}
12454
12455PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012456 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012457\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012458Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012459the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012460separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012461
12462static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012463unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012464{
Victor Stinner9310abb2011-10-05 00:59:23 +020012465 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012466}
12467
Alexander Belopolsky40018472011-02-26 01:02:56 +000012468PyObject *
12469PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012470{
12471 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012472
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012473 s = PyUnicode_FromObject(s);
12474 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012475 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012476 if (sep != NULL) {
12477 sep = PyUnicode_FromObject(sep);
12478 if (sep == NULL) {
12479 Py_DECREF(s);
12480 return NULL;
12481 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012482 }
12483
Victor Stinner9310abb2011-10-05 00:59:23 +020012484 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012485
12486 Py_DECREF(s);
12487 Py_XDECREF(sep);
12488 return result;
12489}
12490
12491PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012492 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012493\n\
12494Return a list of the words in S, using sep as the\n\
12495delimiter string, starting at the end of the string and\n\
12496working to the front. If maxsplit is given, at most maxsplit\n\
12497splits are done. If sep is not specified, any whitespace string\n\
12498is a separator.");
12499
12500static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012501unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012502{
12503 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012504 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012505
Martin v. Löwis18e16552006-02-15 17:27:45 +000012506 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012507 return NULL;
12508
12509 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012510 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012511 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012512 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012513 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012514 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012515}
12516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012517PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012518 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519\n\
12520Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012521Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012522is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523
12524static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012525unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012527 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012528 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012530 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12531 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532 return NULL;
12533
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012534 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535}
12536
12537static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012538PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539{
Walter Dörwald346737f2007-05-31 10:44:43 +000012540 if (PyUnicode_CheckExact(self)) {
12541 Py_INCREF(self);
12542 return self;
12543 } else
12544 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012545 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546}
12547
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012548PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012549 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550\n\
12551Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012552and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553
12554static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012555unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557 return fixup(self, fixswapcase);
12558}
12559
Georg Brandlceee0772007-11-27 23:48:05 +000012560PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012561 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012562\n\
12563Return a translation table usable for str.translate().\n\
12564If there is only one argument, it must be a dictionary mapping Unicode\n\
12565ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012566Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012567If there are two arguments, they must be strings of equal length, and\n\
12568in the resulting dictionary, each character in x will be mapped to the\n\
12569character at the same position in y. If there is a third argument, it\n\
12570must be a string, whose characters will be mapped to None in the result.");
12571
12572static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012573unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012574{
12575 PyObject *x, *y = NULL, *z = NULL;
12576 PyObject *new = NULL, *key, *value;
12577 Py_ssize_t i = 0;
12578 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012579
Georg Brandlceee0772007-11-27 23:48:05 +000012580 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12581 return NULL;
12582 new = PyDict_New();
12583 if (!new)
12584 return NULL;
12585 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 int x_kind, y_kind, z_kind;
12587 void *x_data, *y_data, *z_data;
12588
Georg Brandlceee0772007-11-27 23:48:05 +000012589 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012590 if (!PyUnicode_Check(x)) {
12591 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12592 "be a string if there is a second argument");
12593 goto err;
12594 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012595 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012596 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12597 "arguments must have equal length");
12598 goto err;
12599 }
12600 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012601 x_kind = PyUnicode_KIND(x);
12602 y_kind = PyUnicode_KIND(y);
12603 x_data = PyUnicode_DATA(x);
12604 y_data = PyUnicode_DATA(y);
12605 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12606 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12607 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012608 if (!key || !value)
12609 goto err;
12610 res = PyDict_SetItem(new, key, value);
12611 Py_DECREF(key);
12612 Py_DECREF(value);
12613 if (res < 0)
12614 goto err;
12615 }
12616 /* create entries for deleting chars in z */
12617 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618 z_kind = PyUnicode_KIND(z);
12619 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012620 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012621 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012622 if (!key)
12623 goto err;
12624 res = PyDict_SetItem(new, key, Py_None);
12625 Py_DECREF(key);
12626 if (res < 0)
12627 goto err;
12628 }
12629 }
12630 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631 int kind;
12632 void *data;
12633
Georg Brandlceee0772007-11-27 23:48:05 +000012634 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012635 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012636 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12637 "to maketrans it must be a dict");
12638 goto err;
12639 }
12640 /* copy entries into the new dict, converting string keys to int keys */
12641 while (PyDict_Next(x, &i, &key, &value)) {
12642 if (PyUnicode_Check(key)) {
12643 /* convert string keys to integer keys */
12644 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012645 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012646 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12647 "table must be of length 1");
12648 goto err;
12649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 kind = PyUnicode_KIND(key);
12651 data = PyUnicode_DATA(key);
12652 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012653 if (!newkey)
12654 goto err;
12655 res = PyDict_SetItem(new, newkey, value);
12656 Py_DECREF(newkey);
12657 if (res < 0)
12658 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012659 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012660 /* just keep integer keys */
12661 if (PyDict_SetItem(new, key, value) < 0)
12662 goto err;
12663 } else {
12664 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12665 "be strings or integers");
12666 goto err;
12667 }
12668 }
12669 }
12670 return new;
12671 err:
12672 Py_DECREF(new);
12673 return NULL;
12674}
12675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012676PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012677 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678\n\
12679Return a copy of the string S, where all characters have been mapped\n\
12680through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012681Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012682Unmapped characters are left untouched. Characters mapped to None\n\
12683are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684
12685static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012686unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689}
12690
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012691PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012692 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012693\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012694Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012695
12696static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012697unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699 return fixup(self, fixupper);
12700}
12701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012702PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012703 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012704\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012705Pad a numeric string S with zeros on the left, to fill a field\n\
12706of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012707
12708static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012709unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012710{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012711 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012712 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012713 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012714 int kind;
12715 void *data;
12716 Py_UCS4 chr;
12717
12718 if (PyUnicode_READY(self) == -1)
12719 return NULL;
12720
Martin v. Löwis18e16552006-02-15 17:27:45 +000012721 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012722 return NULL;
12723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012725 if (PyUnicode_CheckExact(self)) {
12726 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012727 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012728 }
12729 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012730 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012731 }
12732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012733 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734
12735 u = pad(self, fill, 0, '0');
12736
Walter Dörwald068325e2002-04-15 13:36:47 +000012737 if (u == NULL)
12738 return NULL;
12739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012740 kind = PyUnicode_KIND(u);
12741 data = PyUnicode_DATA(u);
12742 chr = PyUnicode_READ(kind, data, fill);
12743
12744 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012745 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012746 PyUnicode_WRITE(kind, data, 0, chr);
12747 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012748 }
12749
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012750 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012751 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012752}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012753
#if 0
/* Compiled out.  Thin wrapper returning the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() for `self`. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012762PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012763 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012765Return True if S starts with the specified prefix, False otherwise.\n\
12766With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012767With optional end, stop comparing S at that position.\n\
12768prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012769
12770static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012771unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012772 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012773{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012774 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012775 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012776 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012777 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012778 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012779
Jesus Ceaac451502011-04-20 17:09:23 +020012780 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012781 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012782 if (PyTuple_Check(subobj)) {
12783 Py_ssize_t i;
12784 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012785 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012786 if (substring == NULL)
12787 return NULL;
12788 result = tailmatch(self, substring, start, end, -1);
12789 Py_DECREF(substring);
12790 if (result) {
12791 Py_RETURN_TRUE;
12792 }
12793 }
12794 /* nothing matched */
12795 Py_RETURN_FALSE;
12796 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012797 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012798 if (substring == NULL) {
12799 if (PyErr_ExceptionMatches(PyExc_TypeError))
12800 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12801 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012802 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012803 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012804 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012806 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012807}
12808
12809
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012810PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012811 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012813Return True if S ends with the specified suffix, False otherwise.\n\
12814With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012815With optional end, stop comparing S at that position.\n\
12816suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012817
12818static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012819unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012820 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012822 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012823 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012824 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012825 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012826 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012827
Jesus Ceaac451502011-04-20 17:09:23 +020012828 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012829 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012830 if (PyTuple_Check(subobj)) {
12831 Py_ssize_t i;
12832 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012833 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012834 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012835 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012836 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012837 result = tailmatch(self, substring, start, end, +1);
12838 Py_DECREF(substring);
12839 if (result) {
12840 Py_RETURN_TRUE;
12841 }
12842 }
12843 Py_RETURN_FALSE;
12844 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012845 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012846 if (substring == NULL) {
12847 if (PyErr_ExceptionMatches(PyExc_TypeError))
12848 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12849 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012850 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012851 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012852 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012854 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012855}
12856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012857#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012858
12859PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012860 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012861\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012862Return a formatted version of S, using substitutions from args and kwargs.\n\
12863The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012864
Eric Smith27bbca62010-11-04 17:06:58 +000012865PyDoc_STRVAR(format_map__doc__,
12866 "S.format_map(mapping) -> str\n\
12867\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012868Return a formatted version of S, using substitutions from mapping.\n\
12869The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012870
Eric Smith4a7d76d2008-05-30 18:10:19 +000012871static PyObject *
12872unicode__format__(PyObject* self, PyObject* args)
12873{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012874 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012875
12876 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12877 return NULL;
12878
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012879 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012880 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012881 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012882}
12883
Eric Smith8c663262007-08-25 02:26:07 +000012884PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012885 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012886\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012887Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012888
12889static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012890unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012891{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892 Py_ssize_t size;
12893
12894 /* If it's a compact object, account for base structure +
12895 character data. */
12896 if (PyUnicode_IS_COMPACT_ASCII(v))
12897 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12898 else if (PyUnicode_IS_COMPACT(v))
12899 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012900 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012901 else {
12902 /* If it is a two-block object, account for base object, and
12903 for character block if present. */
12904 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012905 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012906 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012907 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012908 }
12909 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012910 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012911 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012912 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012913 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012914 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012915
12916 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012917}
12918
12919PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012920 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012921
12922static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012923unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012924{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012925 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012926 if (!copy)
12927 return NULL;
12928 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012929}
12930
Guido van Rossumd57fd912000-03-10 22:53:23 +000012931static PyMethodDef unicode_methods[] = {
12932
12933 /* Order is according to common usage: often used methods should
12934 appear first, since lookup is done sequentially. */
12935
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012936 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012937 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12938 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012939 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012940 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12941 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12942 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12943 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12944 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12945 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12946 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012947 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012948 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12949 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12950 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012951 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012952 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12953 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12954 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012955 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012956 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012957 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012958 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012959 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12960 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12961 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12962 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12963 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12964 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12965 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12966 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12967 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12968 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12969 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12970 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12971 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12972 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012973 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012974 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012975 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012976 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012977 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012978 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012979 {"maketrans", (PyCFunction) unicode_maketrans,
12980 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012981 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012982#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012983 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984#endif
12985
12986#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012987 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012988 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012989#endif
12990
Benjamin Peterson14339b62009-01-31 16:36:08 +000012991 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012992 {NULL, NULL}
12993};
12994
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012995static PyObject *
12996unicode_mod(PyObject *v, PyObject *w)
12997{
Brian Curtindfc80e32011-08-10 20:28:54 -050012998 if (!PyUnicode_Check(v))
12999 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013000 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013001}
13002
13003static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013004 0, /*nb_add*/
13005 0, /*nb_subtract*/
13006 0, /*nb_multiply*/
13007 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013008};
13009
Guido van Rossumd57fd912000-03-10 22:53:23 +000013010static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013011 (lenfunc) unicode_length, /* sq_length */
13012 PyUnicode_Concat, /* sq_concat */
13013 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13014 (ssizeargfunc) unicode_getitem, /* sq_item */
13015 0, /* sq_slice */
13016 0, /* sq_ass_item */
13017 0, /* sq_ass_slice */
13018 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013019};
13020
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013021static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013022unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013023{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013024 if (PyUnicode_READY(self) == -1)
13025 return NULL;
13026
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013027 if (PyIndex_Check(item)) {
13028 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013029 if (i == -1 && PyErr_Occurred())
13030 return NULL;
13031 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013032 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013033 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013034 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013035 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013036 PyObject *result;
13037 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013038 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013039 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013041 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013042 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013043 return NULL;
13044 }
13045
13046 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013047 return PyUnicode_New(0, 0);
13048 } else if (start == 0 && step == 1 &&
13049 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013050 PyUnicode_CheckExact(self)) {
13051 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013052 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000013053 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013054 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013055 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013056 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013057 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013058 src_kind = PyUnicode_KIND(self);
13059 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013060 if (!PyUnicode_IS_ASCII(self)) {
13061 kind_limit = kind_maxchar_limit(src_kind);
13062 max_char = 0;
13063 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13064 ch = PyUnicode_READ(src_kind, src_data, cur);
13065 if (ch > max_char) {
13066 max_char = ch;
13067 if (max_char >= kind_limit)
13068 break;
13069 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013070 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013071 }
Victor Stinner55c99112011-10-13 01:17:06 +020013072 else
13073 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013074 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013075 if (result == NULL)
13076 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013077 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013078 dest_data = PyUnicode_DATA(result);
13079
13080 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013081 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13082 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013083 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013084 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013085 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013086 } else {
13087 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13088 return NULL;
13089 }
13090}
13091
13092static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013093 (lenfunc)unicode_length, /* mp_length */
13094 (binaryfunc)unicode_subscript, /* mp_subscript */
13095 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013096};
13097
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098
Guido van Rossumd57fd912000-03-10 22:53:23 +000013099/* Helpers for PyUnicode_Format() */
13100
13101static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013102getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013104 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013106 (*p_argidx)++;
13107 if (arglen < 0)
13108 return args;
13109 else
13110 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111 }
13112 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013113 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114 return NULL;
13115}
13116
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013117/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013119static PyObject *
13120formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013122 char *p;
13123 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013125
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126 x = PyFloat_AsDouble(v);
13127 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013128 return NULL;
13129
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013131 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013132
Eric Smith0923d1d2009-04-16 20:16:10 +000013133 p = PyOS_double_to_string(x, type, prec,
13134 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013135 if (p == NULL)
13136 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013137 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013138 PyMem_Free(p);
13139 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013140}
13141
Tim Peters38fd5b62000-09-21 05:43:11 +000013142static PyObject*
13143formatlong(PyObject *val, int flags, int prec, int type)
13144{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013145 char *buf;
13146 int len;
13147 PyObject *str; /* temporary string object. */
13148 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013149
Benjamin Peterson14339b62009-01-31 16:36:08 +000013150 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13151 if (!str)
13152 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013153 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013154 Py_DECREF(str);
13155 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013156}
13157
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013158static Py_UCS4
13159formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013160{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013161 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013162 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013163 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013164 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013165 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013166 goto onError;
13167 }
13168 else {
13169 /* Integer input truncated to a character */
13170 long x;
13171 x = PyLong_AsLong(v);
13172 if (x == -1 && PyErr_Occurred())
13173 goto onError;
13174
13175 if (x < 0 || x > 0x10ffff) {
13176 PyErr_SetString(PyExc_OverflowError,
13177 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013178 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013179 }
13180
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013181 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013182 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013183
Benjamin Peterson29060642009-01-31 22:14:21 +000013184 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013185 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013186 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013187 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013188}
13189
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013190static int
13191repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13192{
13193 int r;
13194 assert(count > 0);
13195 assert(PyUnicode_Check(obj));
13196 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013197 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013198 if (repeated == NULL)
13199 return -1;
13200 r = _PyAccu_Accumulate(acc, repeated);
13201 Py_DECREF(repeated);
13202 return r;
13203 }
13204 else {
13205 do {
13206 if (_PyAccu_Accumulate(acc, obj))
13207 return -1;
13208 } while (--count);
13209 return 0;
13210 }
13211}
13212
Alexander Belopolsky40018472011-02-26 01:02:56 +000013213PyObject *
13214PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013216 void *fmt;
13217 int fmtkind;
13218 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013219 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013220 int r;
13221 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013222 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013223 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013224 PyObject *temp = NULL;
13225 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013226 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013227 _PyAccu acc;
13228 static PyObject *plus, *minus, *blank, *zero, *percent;
13229
13230 if (!plus && !(plus = get_latin1_char('+')))
13231 return NULL;
13232 if (!minus && !(minus = get_latin1_char('-')))
13233 return NULL;
13234 if (!blank && !(blank = get_latin1_char(' ')))
13235 return NULL;
13236 if (!zero && !(zero = get_latin1_char('0')))
13237 return NULL;
13238 if (!percent && !(percent = get_latin1_char('%')))
13239 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013240
Guido van Rossumd57fd912000-03-10 22:53:23 +000013241 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013242 PyErr_BadInternalCall();
13243 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013244 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013245 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013246 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013247 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013248 if (_PyAccu_Init(&acc))
13249 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013250 fmt = PyUnicode_DATA(uformat);
13251 fmtkind = PyUnicode_KIND(uformat);
13252 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13253 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013254
Guido van Rossumd57fd912000-03-10 22:53:23 +000013255 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013256 arglen = PyTuple_Size(args);
13257 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013258 }
13259 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013260 arglen = -1;
13261 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013262 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013263 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013264 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013265 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013266
13267 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013268 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013269 PyObject *nonfmt;
13270 Py_ssize_t nonfmtpos;
13271 nonfmtpos = fmtpos++;
13272 while (fmtcnt >= 0 &&
13273 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13274 fmtpos++;
13275 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013276 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013277 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013278 if (nonfmt == NULL)
13279 goto onError;
13280 r = _PyAccu_Accumulate(&acc, nonfmt);
13281 Py_DECREF(nonfmt);
13282 if (r)
13283 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013284 }
13285 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013286 /* Got a format specifier */
13287 int flags = 0;
13288 Py_ssize_t width = -1;
13289 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013290 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013291 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013292 int isnumok;
13293 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013294 void *pbuf = NULL;
13295 Py_ssize_t pindex, len;
13296 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013298 fmtpos++;
13299 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13300 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013301 Py_ssize_t keylen;
13302 PyObject *key;
13303 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013304
Benjamin Peterson29060642009-01-31 22:14:21 +000013305 if (dict == NULL) {
13306 PyErr_SetString(PyExc_TypeError,
13307 "format requires a mapping");
13308 goto onError;
13309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013310 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013311 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013312 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013313 /* Skip over balanced parentheses */
13314 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013315 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013316 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013317 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013318 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013319 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013320 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013321 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013322 if (fmtcnt < 0 || pcount > 0) {
13323 PyErr_SetString(PyExc_ValueError,
13324 "incomplete format key");
13325 goto onError;
13326 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013327 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013328 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013329 if (key == NULL)
13330 goto onError;
13331 if (args_owned) {
13332 Py_DECREF(args);
13333 args_owned = 0;
13334 }
13335 args = PyObject_GetItem(dict, key);
13336 Py_DECREF(key);
13337 if (args == NULL) {
13338 goto onError;
13339 }
13340 args_owned = 1;
13341 arglen = -1;
13342 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013343 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013344 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013345 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013346 case '-': flags |= F_LJUST; continue;
13347 case '+': flags |= F_SIGN; continue;
13348 case ' ': flags |= F_BLANK; continue;
13349 case '#': flags |= F_ALT; continue;
13350 case '0': flags |= F_ZERO; continue;
13351 }
13352 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013353 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013354 if (c == '*') {
13355 v = getnextarg(args, arglen, &argidx);
13356 if (v == NULL)
13357 goto onError;
13358 if (!PyLong_Check(v)) {
13359 PyErr_SetString(PyExc_TypeError,
13360 "* wants int");
13361 goto onError;
13362 }
13363 width = PyLong_AsLong(v);
13364 if (width == -1 && PyErr_Occurred())
13365 goto onError;
13366 if (width < 0) {
13367 flags |= F_LJUST;
13368 width = -width;
13369 }
13370 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013371 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013372 }
13373 else if (c >= '0' && c <= '9') {
13374 width = c - '0';
13375 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013376 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013377 if (c < '0' || c > '9')
13378 break;
13379 if ((width*10) / 10 != width) {
13380 PyErr_SetString(PyExc_ValueError,
13381 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013382 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013383 }
13384 width = width*10 + (c - '0');
13385 }
13386 }
13387 if (c == '.') {
13388 prec = 0;
13389 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013390 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013391 if (c == '*') {
13392 v = getnextarg(args, arglen, &argidx);
13393 if (v == NULL)
13394 goto onError;
13395 if (!PyLong_Check(v)) {
13396 PyErr_SetString(PyExc_TypeError,
13397 "* wants int");
13398 goto onError;
13399 }
13400 prec = PyLong_AsLong(v);
13401 if (prec == -1 && PyErr_Occurred())
13402 goto onError;
13403 if (prec < 0)
13404 prec = 0;
13405 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013406 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013407 }
13408 else if (c >= '0' && c <= '9') {
13409 prec = c - '0';
13410 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013411 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013412 if (c < '0' || c > '9')
13413 break;
13414 if ((prec*10) / 10 != prec) {
13415 PyErr_SetString(PyExc_ValueError,
13416 "prec too big");
13417 goto onError;
13418 }
13419 prec = prec*10 + (c - '0');
13420 }
13421 }
13422 } /* prec */
13423 if (fmtcnt >= 0) {
13424 if (c == 'h' || c == 'l' || c == 'L') {
13425 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013426 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013427 }
13428 }
13429 if (fmtcnt < 0) {
13430 PyErr_SetString(PyExc_ValueError,
13431 "incomplete format");
13432 goto onError;
13433 }
13434 if (c != '%') {
13435 v = getnextarg(args, arglen, &argidx);
13436 if (v == NULL)
13437 goto onError;
13438 }
13439 sign = 0;
13440 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013441 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013442 switch (c) {
13443
13444 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013445 _PyAccu_Accumulate(&acc, percent);
13446 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013447
13448 case 's':
13449 case 'r':
13450 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013451 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013452 temp = v;
13453 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013454 }
13455 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013456 if (c == 's')
13457 temp = PyObject_Str(v);
13458 else if (c == 'r')
13459 temp = PyObject_Repr(v);
13460 else
13461 temp = PyObject_ASCII(v);
13462 if (temp == NULL)
13463 goto onError;
13464 if (PyUnicode_Check(temp))
13465 /* nothing to do */;
13466 else {
13467 Py_DECREF(temp);
13468 PyErr_SetString(PyExc_TypeError,
13469 "%s argument has non-string str()");
13470 goto onError;
13471 }
13472 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013473 if (PyUnicode_READY(temp) == -1) {
13474 Py_CLEAR(temp);
13475 goto onError;
13476 }
13477 pbuf = PyUnicode_DATA(temp);
13478 kind = PyUnicode_KIND(temp);
13479 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013480 if (prec >= 0 && len > prec)
13481 len = prec;
13482 break;
13483
13484 case 'i':
13485 case 'd':
13486 case 'u':
13487 case 'o':
13488 case 'x':
13489 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013490 isnumok = 0;
13491 if (PyNumber_Check(v)) {
13492 PyObject *iobj=NULL;
13493
13494 if (PyLong_Check(v)) {
13495 iobj = v;
13496 Py_INCREF(iobj);
13497 }
13498 else {
13499 iobj = PyNumber_Long(v);
13500 }
13501 if (iobj!=NULL) {
13502 if (PyLong_Check(iobj)) {
13503 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013504 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013505 Py_DECREF(iobj);
13506 if (!temp)
13507 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013508 if (PyUnicode_READY(temp) == -1) {
13509 Py_CLEAR(temp);
13510 goto onError;
13511 }
13512 pbuf = PyUnicode_DATA(temp);
13513 kind = PyUnicode_KIND(temp);
13514 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013515 sign = 1;
13516 }
13517 else {
13518 Py_DECREF(iobj);
13519 }
13520 }
13521 }
13522 if (!isnumok) {
13523 PyErr_Format(PyExc_TypeError,
13524 "%%%c format: a number is required, "
13525 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13526 goto onError;
13527 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013528 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013529 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013530 fillobj = zero;
13531 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013532 break;
13533
13534 case 'e':
13535 case 'E':
13536 case 'f':
13537 case 'F':
13538 case 'g':
13539 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013540 temp = formatfloat(v, flags, prec, c);
13541 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013542 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013543 if (PyUnicode_READY(temp) == -1) {
13544 Py_CLEAR(temp);
13545 goto onError;
13546 }
13547 pbuf = PyUnicode_DATA(temp);
13548 kind = PyUnicode_KIND(temp);
13549 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013550 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013551 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013552 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013553 fillobj = zero;
13554 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013555 break;
13556
13557 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013558 {
13559 Py_UCS4 ch = formatchar(v);
13560 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013561 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013562 temp = _PyUnicode_FromUCS4(&ch, 1);
13563 if (temp == NULL)
13564 goto onError;
13565 pbuf = PyUnicode_DATA(temp);
13566 kind = PyUnicode_KIND(temp);
13567 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013568 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013569 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013570
13571 default:
13572 PyErr_Format(PyExc_ValueError,
13573 "unsupported format character '%c' (0x%x) "
13574 "at index %zd",
13575 (31<=c && c<=126) ? (char)c : '?',
13576 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013577 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013578 goto onError;
13579 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013580 /* pbuf is initialized here. */
13581 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013582 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013583 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13584 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013585 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013586 pindex++;
13587 }
13588 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13589 signobj = plus;
13590 len--;
13591 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013592 }
13593 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013594 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013595 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013596 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013597 else
13598 sign = 0;
13599 }
13600 if (width < len)
13601 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013602 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013603 if (fill != ' ') {
13604 assert(signobj != NULL);
13605 if (_PyAccu_Accumulate(&acc, signobj))
13606 goto onError;
13607 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013608 if (width > len)
13609 width--;
13610 }
13611 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013612 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013613 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013614 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013615 second = get_latin1_char(
13616 PyUnicode_READ(kind, pbuf, pindex + 1));
13617 pindex += 2;
13618 if (second == NULL ||
13619 _PyAccu_Accumulate(&acc, zero) ||
13620 _PyAccu_Accumulate(&acc, second))
13621 goto onError;
13622 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013623 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013624 width -= 2;
13625 if (width < 0)
13626 width = 0;
13627 len -= 2;
13628 }
13629 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013630 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013631 if (repeat_accumulate(&acc, fillobj, width - len))
13632 goto onError;
13633 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013634 }
13635 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013636 if (sign) {
13637 assert(signobj != NULL);
13638 if (_PyAccu_Accumulate(&acc, signobj))
13639 goto onError;
13640 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013641 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013642 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13643 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013644 second = get_latin1_char(
13645 PyUnicode_READ(kind, pbuf, pindex + 1));
13646 pindex += 2;
13647 if (second == NULL ||
13648 _PyAccu_Accumulate(&acc, zero) ||
13649 _PyAccu_Accumulate(&acc, second))
13650 goto onError;
13651 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013652 }
13653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013654 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013655 if (temp != NULL) {
13656 assert(pbuf == PyUnicode_DATA(temp));
13657 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013658 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013659 else {
13660 const char *p = (const char *) pbuf;
13661 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013662 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013663 v = PyUnicode_FromKindAndData(kind, p, len);
13664 }
13665 if (v == NULL)
13666 goto onError;
13667 r = _PyAccu_Accumulate(&acc, v);
13668 Py_DECREF(v);
13669 if (r)
13670 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013671 if (width > len && repeat_accumulate(&acc, blank, width - len))
13672 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013673 if (dict && (argidx < arglen) && c != '%') {
13674 PyErr_SetString(PyExc_TypeError,
13675 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013676 goto onError;
13677 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013678 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013679 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013680 } /* until end */
13681 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013682 PyErr_SetString(PyExc_TypeError,
13683 "not all arguments converted during string formatting");
13684 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013685 }
13686
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013687 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013688 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013689 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013690 }
13691 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013692 Py_XDECREF(temp);
13693 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013694 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013695
Benjamin Peterson29060642009-01-31 22:14:21 +000013696 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013697 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013698 Py_XDECREF(temp);
13699 Py_XDECREF(second);
13700 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013701 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013702 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013703 }
13704 return NULL;
13705}
13706
Jeremy Hylton938ace62002-07-17 16:30:39 +000013707static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013708unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13709
Tim Peters6d6c1a32001-08-02 04:15:00 +000013710static PyObject *
13711unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13712{
Benjamin Peterson29060642009-01-31 22:14:21 +000013713 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013714 static char *kwlist[] = {"object", "encoding", "errors", 0};
13715 char *encoding = NULL;
13716 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013717
Benjamin Peterson14339b62009-01-31 16:36:08 +000013718 if (type != &PyUnicode_Type)
13719 return unicode_subtype_new(type, args, kwds);
13720 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013721 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013722 return NULL;
13723 if (x == NULL)
Victor Stinner7931d9a2011-11-04 00:22:48 +010013724 return PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013725 if (encoding == NULL && errors == NULL)
13726 return PyObject_Str(x);
13727 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013728 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013729}
13730
Guido van Rossume023fe02001-08-30 03:12:59 +000013731static PyObject *
13732unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13733{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013734 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013735 Py_ssize_t length, char_size;
13736 int share_wstr, share_utf8;
13737 unsigned int kind;
13738 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013739
Benjamin Peterson14339b62009-01-31 16:36:08 +000013740 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013741
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013742 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013743 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013744 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013745 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013746 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013747 return NULL;
13748
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013749 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013750 if (self == NULL) {
13751 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013752 return NULL;
13753 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013754 kind = PyUnicode_KIND(unicode);
13755 length = PyUnicode_GET_LENGTH(unicode);
13756
13757 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013758#ifdef Py_DEBUG
13759 _PyUnicode_HASH(self) = -1;
13760#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013761 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013762#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013763 _PyUnicode_STATE(self).interned = 0;
13764 _PyUnicode_STATE(self).kind = kind;
13765 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013766 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013767 _PyUnicode_STATE(self).ready = 1;
13768 _PyUnicode_WSTR(self) = NULL;
13769 _PyUnicode_UTF8_LENGTH(self) = 0;
13770 _PyUnicode_UTF8(self) = NULL;
13771 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013772 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013773
13774 share_utf8 = 0;
13775 share_wstr = 0;
13776 if (kind == PyUnicode_1BYTE_KIND) {
13777 char_size = 1;
13778 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13779 share_utf8 = 1;
13780 }
13781 else if (kind == PyUnicode_2BYTE_KIND) {
13782 char_size = 2;
13783 if (sizeof(wchar_t) == 2)
13784 share_wstr = 1;
13785 }
13786 else {
13787 assert(kind == PyUnicode_4BYTE_KIND);
13788 char_size = 4;
13789 if (sizeof(wchar_t) == 4)
13790 share_wstr = 1;
13791 }
13792
13793 /* Ensure we won't overflow the length. */
13794 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13795 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013796 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013797 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013798 data = PyObject_MALLOC((length + 1) * char_size);
13799 if (data == NULL) {
13800 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013801 goto onError;
13802 }
13803
Victor Stinnerc3c74152011-10-02 20:39:55 +020013804 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013805 if (share_utf8) {
13806 _PyUnicode_UTF8_LENGTH(self) = length;
13807 _PyUnicode_UTF8(self) = data;
13808 }
13809 if (share_wstr) {
13810 _PyUnicode_WSTR_LENGTH(self) = length;
13811 _PyUnicode_WSTR(self) = (wchar_t *)data;
13812 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013813
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013814 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013815 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013816 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013817#ifdef Py_DEBUG
13818 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13819#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013820 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013821 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013822
13823onError:
13824 Py_DECREF(unicode);
13825 Py_DECREF(self);
13826 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013827}
13828
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013829PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013830 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013831\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013832Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013833encoding defaults to the current default string encoding.\n\
13834errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013835
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013836static PyObject *unicode_iter(PyObject *seq);
13837
Guido van Rossumd57fd912000-03-10 22:53:23 +000013838PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013839 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013840 "str", /* tp_name */
13841 sizeof(PyUnicodeObject), /* tp_size */
13842 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013843 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013844 (destructor)unicode_dealloc, /* tp_dealloc */
13845 0, /* tp_print */
13846 0, /* tp_getattr */
13847 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013848 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013849 unicode_repr, /* tp_repr */
13850 &unicode_as_number, /* tp_as_number */
13851 &unicode_as_sequence, /* tp_as_sequence */
13852 &unicode_as_mapping, /* tp_as_mapping */
13853 (hashfunc) unicode_hash, /* tp_hash*/
13854 0, /* tp_call*/
13855 (reprfunc) unicode_str, /* tp_str */
13856 PyObject_GenericGetAttr, /* tp_getattro */
13857 0, /* tp_setattro */
13858 0, /* tp_as_buffer */
13859 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013860 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013861 unicode_doc, /* tp_doc */
13862 0, /* tp_traverse */
13863 0, /* tp_clear */
13864 PyUnicode_RichCompare, /* tp_richcompare */
13865 0, /* tp_weaklistoffset */
13866 unicode_iter, /* tp_iter */
13867 0, /* tp_iternext */
13868 unicode_methods, /* tp_methods */
13869 0, /* tp_members */
13870 0, /* tp_getset */
13871 &PyBaseObject_Type, /* tp_base */
13872 0, /* tp_dict */
13873 0, /* tp_descr_get */
13874 0, /* tp_descr_set */
13875 0, /* tp_dictoffset */
13876 0, /* tp_init */
13877 0, /* tp_alloc */
13878 unicode_new, /* tp_new */
13879 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013880};
13881
13882/* Initialize the Unicode implementation */
13883
Victor Stinner3a50e702011-10-18 21:21:00 +020013884int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013885{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013886 int i;
13887
Thomas Wouters477c8d52006-05-27 19:21:47 +000013888 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013889 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013890 0x000A, /* LINE FEED */
13891 0x000D, /* CARRIAGE RETURN */
13892 0x001C, /* FILE SEPARATOR */
13893 0x001D, /* GROUP SEPARATOR */
13894 0x001E, /* RECORD SEPARATOR */
13895 0x0085, /* NEXT LINE */
13896 0x2028, /* LINE SEPARATOR */
13897 0x2029, /* PARAGRAPH SEPARATOR */
13898 };
13899
Fred Drakee4315f52000-05-09 19:53:39 +000013900 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013901 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013902 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013903 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013904 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013905
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013906 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013907 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013908 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013909 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013910
13911 /* initialize the linebreak bloom filter */
13912 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013913 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013914 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013915
13916 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013917
13918#ifdef HAVE_MBCS
13919 winver.dwOSVersionInfoSize = sizeof(winver);
13920 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13921 PyErr_SetFromWindowsErr(0);
13922 return -1;
13923 }
13924#endif
13925 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013926}
13927
13928/* Finalize the Unicode implementation */
13929
/* This implementation keeps no free list, so there is never anything to
   release; the function always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13935
Guido van Rossumd57fd912000-03-10 22:53:23 +000013936void
Thomas Wouters78890102000-07-22 19:25:51 +000013937_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013938{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013939 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013940
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013941 Py_XDECREF(unicode_empty);
13942 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013943
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013944 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013945 if (unicode_latin1[i]) {
13946 Py_DECREF(unicode_latin1[i]);
13947 unicode_latin1[i] = NULL;
13948 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013949 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013950 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013951 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013952}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013953
Walter Dörwald16807132007-05-25 13:52:07 +000013954void
13955PyUnicode_InternInPlace(PyObject **p)
13956{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013957 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013958 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013959#ifdef Py_DEBUG
13960 assert(s != NULL);
13961 assert(_PyUnicode_CHECK(s));
13962#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013963 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013964 return;
13965#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013966 /* If it's a subclass, we don't really know what putting
13967 it in the interned dict might do. */
13968 if (!PyUnicode_CheckExact(s))
13969 return;
13970 if (PyUnicode_CHECK_INTERNED(s))
13971 return;
13972 if (interned == NULL) {
13973 interned = PyDict_New();
13974 if (interned == NULL) {
13975 PyErr_Clear(); /* Don't leave an exception */
13976 return;
13977 }
13978 }
13979 /* It might be that the GetItem call fails even
13980 though the key is present in the dictionary,
13981 namely when this happens during a stack overflow. */
13982 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013983 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013984 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013985
Benjamin Peterson29060642009-01-31 22:14:21 +000013986 if (t) {
13987 Py_INCREF(t);
13988 Py_DECREF(*p);
13989 *p = t;
13990 return;
13991 }
Walter Dörwald16807132007-05-25 13:52:07 +000013992
Benjamin Peterson14339b62009-01-31 16:36:08 +000013993 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013994 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013995 PyErr_Clear();
13996 PyThreadState_GET()->recursion_critical = 0;
13997 return;
13998 }
13999 PyThreadState_GET()->recursion_critical = 0;
14000 /* The two references in interned are not counted by refcnt.
14001 The deallocator will take care of this */
14002 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014003 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014004}
14005
14006void
14007PyUnicode_InternImmortal(PyObject **p)
14008{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014009 PyUnicode_InternInPlace(p);
14010 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014011 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014012 Py_INCREF(*p);
14013 }
Walter Dörwald16807132007-05-25 13:52:07 +000014014}
14015
14016PyObject *
14017PyUnicode_InternFromString(const char *cp)
14018{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014019 PyObject *s = PyUnicode_FromString(cp);
14020 if (s == NULL)
14021 return NULL;
14022 PyUnicode_InternInPlace(&s);
14023 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014024}
14025
Alexander Belopolsky40018472011-02-26 01:02:56 +000014026void
14027_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014028{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014029 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014030 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014031 Py_ssize_t i, n;
14032 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014033
Benjamin Peterson14339b62009-01-31 16:36:08 +000014034 if (interned == NULL || !PyDict_Check(interned))
14035 return;
14036 keys = PyDict_Keys(interned);
14037 if (keys == NULL || !PyList_Check(keys)) {
14038 PyErr_Clear();
14039 return;
14040 }
Walter Dörwald16807132007-05-25 13:52:07 +000014041
Benjamin Peterson14339b62009-01-31 16:36:08 +000014042 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14043 detector, interned unicode strings are not forcibly deallocated;
14044 rather, we give them their stolen references back, and then clear
14045 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014046
Benjamin Peterson14339b62009-01-31 16:36:08 +000014047 n = PyList_GET_SIZE(keys);
14048 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014049 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014050 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014051 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014052 if (PyUnicode_READY(s) == -1) {
14053 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014054 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014055 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014056 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014057 case SSTATE_NOT_INTERNED:
14058 /* XXX Shouldn't happen */
14059 break;
14060 case SSTATE_INTERNED_IMMORTAL:
14061 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014062 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014063 break;
14064 case SSTATE_INTERNED_MORTAL:
14065 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014066 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014067 break;
14068 default:
14069 Py_FatalError("Inconsistent interned string state.");
14070 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014071 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014072 }
14073 fprintf(stderr, "total size of all interned strings: "
14074 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14075 "mortal/immortal\n", mortal_size, immortal_size);
14076 Py_DECREF(keys);
14077 PyDict_Clear(interned);
14078 Py_DECREF(interned);
14079 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014080}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014081
14082
14083/********************* Unicode Iterator **************************/
14084
14085typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014086 PyObject_HEAD
14087 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014088 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014089} unicodeiterobject;
14090
14091static void
14092unicodeiter_dealloc(unicodeiterobject *it)
14093{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014094 _PyObject_GC_UNTRACK(it);
14095 Py_XDECREF(it->it_seq);
14096 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014097}
14098
14099static int
14100unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14101{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014102 Py_VISIT(it->it_seq);
14103 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014104}
14105
14106static PyObject *
14107unicodeiter_next(unicodeiterobject *it)
14108{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014109 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014110
Benjamin Peterson14339b62009-01-31 16:36:08 +000014111 assert(it != NULL);
14112 seq = it->it_seq;
14113 if (seq == NULL)
14114 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014115 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014117 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14118 int kind = PyUnicode_KIND(seq);
14119 void *data = PyUnicode_DATA(seq);
14120 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14121 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014122 if (item != NULL)
14123 ++it->it_index;
14124 return item;
14125 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014126
Benjamin Peterson14339b62009-01-31 16:36:08 +000014127 Py_DECREF(seq);
14128 it->it_seq = NULL;
14129 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014130}
14131
14132static PyObject *
14133unicodeiter_len(unicodeiterobject *it)
14134{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014135 Py_ssize_t len = 0;
14136 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014137 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014138 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014139}
14140
14141PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14142
14143static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014144 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014145 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014146 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014147};
14148
14149PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014150 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14151 "str_iterator", /* tp_name */
14152 sizeof(unicodeiterobject), /* tp_basicsize */
14153 0, /* tp_itemsize */
14154 /* methods */
14155 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14156 0, /* tp_print */
14157 0, /* tp_getattr */
14158 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014159 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014160 0, /* tp_repr */
14161 0, /* tp_as_number */
14162 0, /* tp_as_sequence */
14163 0, /* tp_as_mapping */
14164 0, /* tp_hash */
14165 0, /* tp_call */
14166 0, /* tp_str */
14167 PyObject_GenericGetAttr, /* tp_getattro */
14168 0, /* tp_setattro */
14169 0, /* tp_as_buffer */
14170 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14171 0, /* tp_doc */
14172 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14173 0, /* tp_clear */
14174 0, /* tp_richcompare */
14175 0, /* tp_weaklistoffset */
14176 PyObject_SelfIter, /* tp_iter */
14177 (iternextfunc)unicodeiter_next, /* tp_iternext */
14178 unicodeiter_methods, /* tp_methods */
14179 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014180};
14181
14182static PyObject *
14183unicode_iter(PyObject *seq)
14184{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014185 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014186
Benjamin Peterson14339b62009-01-31 16:36:08 +000014187 if (!PyUnicode_Check(seq)) {
14188 PyErr_BadInternalCall();
14189 return NULL;
14190 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014191 if (PyUnicode_READY(seq) == -1)
14192 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014193 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14194 if (it == NULL)
14195 return NULL;
14196 it->it_index = 0;
14197 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014198 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014199 _PyObject_GC_TRACK(it);
14200 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014201}
14202
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014203
14204size_t
14205Py_UNICODE_strlen(const Py_UNICODE *u)
14206{
14207 int res = 0;
14208 while(*u++)
14209 res++;
14210 return res;
14211}
14212
14213Py_UNICODE*
14214Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14215{
14216 Py_UNICODE *u = s1;
14217 while ((*u++ = *s2++));
14218 return s1;
14219}
14220
14221Py_UNICODE*
14222Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14223{
14224 Py_UNICODE *u = s1;
14225 while ((*u++ = *s2++))
14226 if (n-- == 0)
14227 break;
14228 return s1;
14229}
14230
14231Py_UNICODE*
14232Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14233{
14234 Py_UNICODE *u1 = s1;
14235 u1 += Py_UNICODE_strlen(u1);
14236 Py_UNICODE_strcpy(u1, s2);
14237 return s1;
14238}
14239
14240int
14241Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14242{
14243 while (*s1 && *s2 && *s1 == *s2)
14244 s1++, s2++;
14245 if (*s1 && *s2)
14246 return (*s1 < *s2) ? -1 : +1;
14247 if (*s1)
14248 return 1;
14249 if (*s2)
14250 return -1;
14251 return 0;
14252}
14253
14254int
14255Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14256{
14257 register Py_UNICODE u1, u2;
14258 for (; n != 0; n--) {
14259 u1 = *s1;
14260 u2 = *s2;
14261 if (u1 != u2)
14262 return (u1 < u2) ? -1 : +1;
14263 if (u1 == '\0')
14264 return 0;
14265 s1++;
14266 s2++;
14267 }
14268 return 0;
14269}
14270
14271Py_UNICODE*
14272Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14273{
14274 const Py_UNICODE *p;
14275 for (p = s; *p; p++)
14276 if (*p == c)
14277 return (Py_UNICODE*)p;
14278 return NULL;
14279}
14280
14281Py_UNICODE*
14282Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14283{
14284 const Py_UNICODE *p;
14285 p = s + Py_UNICODE_strlen(s);
14286 while (p != s) {
14287 p--;
14288 if (*p == c)
14289 return (Py_UNICODE*)p;
14290 }
14291 return NULL;
14292}
Victor Stinner331ea922010-08-10 16:37:20 +000014293
Victor Stinner71133ff2010-09-01 23:43:53 +000014294Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014295PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014296{
Victor Stinner577db2c2011-10-11 22:12:48 +020014297 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014298 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014300 if (!PyUnicode_Check(unicode)) {
14301 PyErr_BadArgument();
14302 return NULL;
14303 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014304 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014305 if (u == NULL)
14306 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014307 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014308 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014309 PyErr_NoMemory();
14310 return NULL;
14311 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014312 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014313 size *= sizeof(Py_UNICODE);
14314 copy = PyMem_Malloc(size);
14315 if (copy == NULL) {
14316 PyErr_NoMemory();
14317 return NULL;
14318 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014319 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014320 return copy;
14321}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014322
Georg Brandl66c221e2010-10-14 07:04:07 +000014323/* A _string module, to export formatter_parser and formatter_field_name_split
14324 to the string.Formatter class implemented in Python. */
14325
14326static PyMethodDef _string_methods[] = {
14327 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14328 METH_O, PyDoc_STR("split the argument as a field name")},
14329 {"formatter_parser", (PyCFunction) formatter_parser,
14330 METH_O, PyDoc_STR("parse the argument as a format string")},
14331 {NULL, NULL}
14332};
14333
14334static struct PyModuleDef _string_module = {
14335 PyModuleDef_HEAD_INIT,
14336 "_string",
14337 PyDoc_STR("string helper module"),
14338 0,
14339 _string_methods,
14340 NULL,
14341 NULL,
14342 NULL,
14343 NULL
14344};
14345
14346PyMODINIT_FUNC
14347PyInit__string(void)
14348{
14349 return PyModule_Create(&_string_module);
14350}
14351
14352
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014353#ifdef __cplusplus
14354}
14355#endif